full_data <- readRDS('data/full_data.rds')
daily_full <- readRDS('data/daily_full.rds')

Explore some GAM models

Five minute H2S

h2s_model_a <- gam(H2S~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2), 
                   data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + wd_sec + ws + 
##     I(1/MinDist^2)
## 
## Parametric coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.173e+00  3.209e-03  365.68   <2e-16 ***
## year2023       -3.275e-01  2.799e-03 -117.01   <2e-16 ***
## wd_secQ2       -2.566e-01  3.748e-03  -68.47   <2e-16 ***
## wd_secQ3       -2.906e-01  3.889e-03  -74.74   <2e-16 ***
## wd_secQ4       -2.103e-01  3.489e-03  -60.27   <2e-16 ***
## ws             -5.429e-02  4.174e-04 -130.06   <2e-16 ***
## I(1/MinDist^2)  2.115e+05  2.329e+03   90.79   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df    F p-value    
## s(as.numeric(month)) 7.996      8 2499  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0948   Deviance explained = 9.48%
## GCV = 0.92955  Scale est. = 0.92953   n = 772917
plot(h2s_model_a)

gc()
##             used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   4572112 244.2    9110970  486.6   6745969  360.3
## Vcells 104317198 795.9  296867597 2265.0 246918093 1883.9
h2s_model_b <- gam(H2S~s(as.numeric(month),bs='cc')+wd_sec+ws+I(1/MinDist^2)+
                   s(mon_utm_x, mon_utm_y, bs='tp', k = 8), 
                   data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_b)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + wd_sec + ws + I(1/MinDist^2) + 
##     s(mon_utm_x, mon_utm_y, bs = "tp", k = 8)
## 
## Parametric coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.014e+00  3.167e-03  320.23   <2e-16 ***
## wd_secQ2       -1.962e-01  3.692e-03  -53.14   <2e-16 ***
## wd_secQ3       -1.830e-01  3.900e-03  -46.92   <2e-16 ***
## wd_secQ4       -1.139e-01  3.482e-03  -32.71   <2e-16 ***
## ws             -6.928e-02  4.144e-04 -167.17   <2e-16 ***
## I(1/MinDist^2)  2.989e+05  2.856e+03  104.67   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                          edf Ref.df    F p-value    
## s(as.numeric(month))   7.999      8 1711  <2e-16 ***
## s(mon_utm_x,mon_utm_y) 6.999      7 6597  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.131   Deviance explained = 13.1%
## GCV = 0.89269  Scale est. = 0.89267   n = 772917
plot(h2s_model_b)

gc()
##             used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   4585792 245.0    9110970  486.6   6745969  360.3
## Vcells 114842767 876.2  296867597 2265.0 296520206 2262.3
# Include converted angle and weekday
h2s_model_c <- gam(H2S ~ s(as.numeric(month),bs='cc') + year + wd_sec + ws + 
                     I(1/MinDist^2) + Converted_Angle + 
                     s(mon_utm_x, mon_utm_y, bs='tp', k = 10) + 
                     as.factor(weekday), 
                   data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_c)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + wd_sec + ws + 
##     I(1/MinDist^2) + Converted_Angle + s(mon_utm_x, mon_utm_y, 
##     bs = "tp", k = 10) + as.factor(weekday)
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           2.496e+00  2.319e-02  107.606  < 2e-16 ***
## year2023             -3.332e-01  2.865e-03 -116.278  < 2e-16 ***
## wd_secQ2             -1.871e-01  3.641e-03  -51.379  < 2e-16 ***
## wd_secQ3             -1.928e-01  3.843e-03  -50.165  < 2e-16 ***
## wd_secQ4             -1.104e-01  3.439e-03  -32.108  < 2e-16 ***
## ws                   -6.128e-02  4.078e-04 -150.288  < 2e-16 ***
## I(1/MinDist^2)       -3.307e-05  3.303e-07 -100.112  < 2e-16 ***
## Converted_Angle      -6.129e-03  1.103e-04  -55.545  < 2e-16 ***
## as.factor(weekday).L  9.819e-02  2.801e-03   35.048  < 2e-16 ***
## as.factor(weekday).Q -1.708e-01  2.808e-03  -60.809  < 2e-16 ***
## as.factor(weekday).C -2.851e-03  2.803e-03   -1.017    0.309    
## as.factor(weekday)^4 -2.022e-02  2.815e-03   -7.185 6.72e-13 ***
## as.factor(weekday)^5 -5.940e-02  2.806e-03  -21.172  < 2e-16 ***
## as.factor(weekday)^6 -2.384e-02  2.819e-03   -8.457  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                          edf Ref.df    F p-value    
## s(as.numeric(month))   7.998      8 2708  <2e-16 ***
## s(mon_utm_x,mon_utm_y) 8.999      9 6521  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 30/31
## R-sq.(adj) =  0.156   Deviance explained = 15.6%
## GCV = 0.86676  Scale est. = 0.86672   n = 772917
plot(h2s_model_c)

gc()
##             used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   4587653 245.1    9110970  486.6   6745969  360.3
## Vcells 126876717 968.0  427665339 3262.9 388282572 2962.4
# Include converted angle and weekday
h2s_model_d <- gam(H2S ~ 
                     s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
                     wd_sec + ws + downwind_ref + downwind_wrp +
                     I(1/MinDist^2) + dist_wrp + capacity +
                     Converted_Angle + 
                     s(mon_utm_x, mon_utm_y, bs='tp', k = 10) + 
                     monthly_oil_1km + monthly_gas_1km + active_1km
                     , 
                   data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_d)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_sec + ws + downwind_ref + downwind_wrp + I(1/MinDist^2) + 
##     dist_wrp + capacity + Converted_Angle + s(mon_utm_x, mon_utm_y, 
##     bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)          -2.642e+00  5.385e-02  -49.056  < 2e-16 ***
## year2023             -1.044e-01  6.558e-03  -15.914  < 2e-16 ***
## as.factor(weekday).L  1.077e-01  2.988e-03   36.055  < 2e-16 ***
## as.factor(weekday).Q -1.851e-01  2.993e-03  -61.867  < 2e-16 ***
## as.factor(weekday).C  1.469e-04  2.978e-03    0.049  0.96066    
## as.factor(weekday)^4 -1.439e-02  2.982e-03   -4.824 1.41e-06 ***
## as.factor(weekday)^5 -6.110e-02  2.971e-03  -20.566  < 2e-16 ***
## as.factor(weekday)^6 -2.740e-02  2.982e-03   -9.188  < 2e-16 ***
## wd_secQ2             -1.532e-01  3.930e-03  -38.984  < 2e-16 ***
## wd_secQ3             -1.332e-01  4.153e-03  -32.078  < 2e-16 ***
## wd_secQ4             -7.319e-02  3.671e-03  -19.937  < 2e-16 ***
## ws                   -7.080e-02  4.332e-04 -163.434  < 2e-16 ***
## downwind_ref          1.429e-01  4.128e-03   34.614  < 2e-16 ***
## downwind_wrp         -1.389e-02  4.573e-03   -3.038  0.00238 ** 
## I(1/MinDist^2)        3.304e-05  1.031e-06   32.048  < 2e-16 ***
## dist_wrp              4.645e-04  9.171e-06   50.643  < 2e-16 ***
## capacity              5.504e-03  8.696e-05   63.294  < 2e-16 ***
## Converted_Angle      -2.974e-03  1.184e-04  -25.115  < 2e-16 ***
## monthly_oil_1km       6.487e-05  3.240e-06   20.023  < 2e-16 ***
## monthly_gas_1km       2.866e-04  1.230e-05   23.298  < 2e-16 ***
## active_1km           -3.712e-02  1.334e-03  -27.827  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                          edf Ref.df    F p-value    
## s(as.numeric(month))   7.997  8.000 1017  <2e-16 ***
## s(mon_utm_x,mon_utm_y) 8.007  8.007 5214  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 36/38
## R-sq.(adj) =  0.162   Deviance explained = 16.2%
## GCV = 0.89963  Scale est. = 0.89958   n = 712259
plot(h2s_model_d)

gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   4588548  245.1    9110970  486.6   6745969  360.3
## Vcells 141948582 1083.0  427665339 3262.9 427590071 3262.3
# Include elvation, EVI, 3D smooth, odor, dist to dom channel, temp, hum, precip
h2s_model_e <- readRDS('rfiles/h2s_model_e.rds')
# h2s_model_e <- gam(H2S ~ 
#                      s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
#                      wd_sec + ws + downwind_ref + downwind_wrp +
#                      I(1/MinDist^2) + I(1/dist_wrp^2) + capacity + wrp_angle +
#                      Converted_Angle + elevation + EVI + 
#                      s(mon_utm_x, mon_utm_y, bs='tp', k = 10) + 
#                      te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
#                      monthly_oil_1km + monthly_gas_1km + active_1km +
#                      num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum + precip
#                      , 
#                    data = full_data %>% filter(day >= '2022-02-01'))
#saveRDS(h2s_model_e, 'rfiles/h2s_model_e.rds')
summary(h2s_model_e)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_sec + ws + downwind_ref + downwind_wrp + I(1/MinDist^2) + 
##     I(1/dist_wrp^2) + capacity + wrp_angle + Converted_Angle + 
##     elevation + EVI + s(mon_utm_x, mon_utm_y, bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + num_odor_complaints + I(1/dist_dc^2) + 
##     avg_temp + avg_hum + precip
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           2.285e+00  6.510e-01    3.510 0.000449 ***
## year2023              9.870e-02  1.712e-02    5.766 8.12e-09 ***
## as.factor(weekday).L  1.129e-01  2.852e-03   39.594  < 2e-16 ***
## as.factor(weekday).Q -1.747e-01  2.841e-03  -61.484  < 2e-16 ***
## as.factor(weekday).C  1.584e-02  2.824e-03    5.609 2.03e-08 ***
## as.factor(weekday)^4 -1.810e-02  2.828e-03   -6.400 1.55e-10 ***
## as.factor(weekday)^5 -5.583e-02  2.808e-03  -19.883  < 2e-16 ***
## as.factor(weekday)^6 -2.485e-02  2.822e-03   -8.806  < 2e-16 ***
## wd_secQ2             -1.031e-01  3.741e-03  -27.569  < 2e-16 ***
## wd_secQ3             -1.454e-01  3.962e-03  -36.702  < 2e-16 ***
## wd_secQ4             -1.146e-01  3.490e-03  -32.841  < 2e-16 ***
## ws                   -6.036e-02  4.158e-04 -145.169  < 2e-16 ***
## downwind_ref          1.098e-01  3.923e-03   28.001  < 2e-16 ***
## downwind_wrp          8.079e-03  4.326e-03    1.868 0.061802 .  
## I(1/MinDist^2)        1.707e-05  6.264e-06    2.726 0.006420 ** 
## I(1/dist_wrp^2)       1.056e-06  3.794e-07    2.783 0.005394 ** 
## capacity              7.434e-03  1.338e-03    5.556 2.75e-08 ***
## wrp_angle             4.063e-03  9.670e-04    4.201 2.65e-05 ***
## Converted_Angle      -1.586e-02  2.157e-03   -7.352 1.95e-13 ***
## elevation            -2.059e-01  1.293e-02  -15.920  < 2e-16 ***
## EVI                   2.975e+00  6.113e-01    4.868 1.13e-06 ***
## monthly_oil_1km       1.856e-04  6.710e-06   27.663  < 2e-16 ***
## monthly_gas_1km      -4.097e-04  3.277e-05  -12.504  < 2e-16 ***
## active_1km           -3.057e-02  2.530e-03  -12.081  < 2e-16 ***
## num_odor_complaints   2.821e-02  1.917e-03   14.718  < 2e-16 ***
## I(1/dist_dc^2)        1.019e-04  9.015e-06   11.303  < 2e-16 ***
## avg_temp              9.080e-03  4.090e-04   22.202  < 2e-16 ***
## avg_hum              -7.505e-03  1.056e-04  -71.073  < 2e-16 ***
## precip               -8.285e-02  5.145e-03  -16.102  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df       F
## s(as.numeric(month))                                     7.994  8.000 310.126
## s(mon_utm_x,mon_utm_y)                                   1.865  1.865   0.459
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 81.249 81.253 899.338
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(mon_utm_x,mon_utm_y)                                    0.542    
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 122/135
## R-sq.(adj) =  0.255   Deviance explained = 25.5%
## GCV = 0.80029  Scale est. = 0.80016   n = 712259
plot(h2s_model_e)

gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   4596311  245.5    9110970  486.6   6745969  360.3
## Vcells 180079671 1373.9  427665339 3262.9 427590071 3262.3
# Model only the month of the event, October 2021
h2s_model_f <- readRDS('rfiles/h2s_model_f.rds')
# h2s_model_f <- gam(H2S ~ wd_sec + ws + downwind_ref + downwind_wrp +
#                      I(1/MinDist^2) + I(1/dist_wrp^2) + capacity + wrp_angle +
#                      Converted_Angle + elevation + EVI + 
#                      s(mon_utm_x, mon_utm_y, bs='tp', k = 10) + 
#                      te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
#                      monthly_oil_1km + monthly_gas_1km + active_1km +
#                      num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum + precip, 
#                    data = full_data %>% filter(year == '2021', month == '10'))
# saveRDS(h2s_model_f, 'rfiles/h2s_model_f.rds')
summary(h2s_model_f)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ wd_sec + ws + downwind_ref + downwind_wrp + I(1/MinDist^2) + 
##     I(1/dist_wrp^2) + capacity + wrp_angle + Converted_Angle + 
##     elevation + EVI + s(mon_utm_x, mon_utm_y, bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + num_odor_complaints + I(1/dist_dc^2) + 
##     avg_temp + avg_hum + precip
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          9.099e-07  4.542e-07   2.003  0.04516 *  
## wd_secQ2             3.129e+00  9.512e-01   3.289  0.00101 ** 
## wd_secQ3             6.611e+00  9.439e-01   7.004 2.50e-12 ***
## wd_secQ4             7.882e+00  8.910e-01   8.845  < 2e-16 ***
## ws                  -2.321e+00  1.159e-01 -20.026  < 2e-16 ***
## downwind_ref         1.639e+00  8.589e-01   1.908  0.05642 .  
## downwind_wrp         1.494e+00  1.077e+00   1.388  0.16529    
## I(1/MinDist^2)      -4.561e-03  1.475e-03  -3.093  0.00198 ** 
## I(1/dist_wrp^2)     -9.325e-04  5.499e-05 -16.959  < 2e-16 ***
## capacity             7.671e-01  1.692e-01   4.533 5.82e-06 ***
## wrp_angle            8.449e-01  1.825e-01   4.629 3.68e-06 ***
## Converted_Angle     -2.556e+00  3.095e-01  -8.258  < 2e-16 ***
## elevation           -2.652e+01  2.679e+00  -9.899  < 2e-16 ***
## EVI                  8.337e+02  1.155e+02   7.220 5.23e-13 ***
## monthly_oil_1km      1.533e-02  7.959e-03   1.926  0.05412 .  
## monthly_gas_1km      2.295e-03  1.205e-03   1.905  0.05680 .  
## active_1km           3.981e-05  2.000e-05   1.991  0.04647 *  
## num_odor_complaints  8.458e-01  6.767e-02  12.498  < 2e-16 ***
## I(1/dist_dc^2)       7.748e+00  3.991e-01  19.415  < 2e-16 ***
## avg_temp             1.349e+00  1.864e-01   7.236 4.66e-13 ***
## avg_hum              1.629e-01  5.258e-02   3.099  0.00194 ** 
## precip              -4.019e+01  7.628e+00  -5.269 1.37e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(mon_utm_x,mon_utm_y)                                   2.017   2.02  4.625
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 84.337  84.58 61.960
##                                                         p-value    
## s(mon_utm_x,mon_utm_y)                                  0.00868 ** 
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 108/120
## R-sq.(adj) =  0.265   Deviance explained = 26.5%
## GCV = 5448.6  Scale est. = 5441.4    n = 77510
plot(h2s_model_f)

gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   4597834  245.6    9110970  486.6   6745969  360.3
## Vcells 184066073 1404.4  427665339 3262.9 427590071 3262.3
# Model data excluding the months of the event
h2s_model_g <- readRDS('rfiles/h2s_model_g.rds')
# h2s_model_g <- gam(H2S ~ s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
#                      wd_sec + ws + downwind_ref + downwind_wrp +
#                      I(1/MinDist^2) + I(1/dist_wrp^2) + capacity + wrp_angle +
#                      Converted_Angle + elevation + EVI + 
#                      s(mon_utm_x, mon_utm_y, bs='tp', k = 10) + 
#                      te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
#                      monthly_oil_1km + monthly_gas_1km + active_1km +
#                      num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum + precip
#                      , 
#                    data = full_data %>% filter(!(year == '2021' & month %in% c('10', '11', '12'))))
# saveRDS(h2s_model_g, 'rfiles/h2s_model_g.rds')
summary(h2s_model_g)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_sec + ws + downwind_ref + downwind_wrp + I(1/MinDist^2) + 
##     I(1/dist_wrp^2) + capacity + wrp_angle + Converted_Angle + 
##     elevation + EVI + s(mon_utm_x, mon_utm_y, bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + num_odor_complaints + I(1/dist_dc^2) + 
##     avg_temp + avg_hum + precip
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           5.539e+00  3.610e-01   15.343  < 2e-16 ***
## year2021              3.707e-01  1.375e-02   26.967  < 2e-16 ***
## year2022             -5.225e-01  2.573e-02  -20.307  < 2e-16 ***
## year2023             -6.492e-01  2.547e-02  -25.492  < 2e-16 ***
## as.factor(weekday).L  8.394e-02  2.184e-03   38.426  < 2e-16 ***
## as.factor(weekday).Q -1.356e-01  2.182e-03  -62.151  < 2e-16 ***
## as.factor(weekday).C  1.406e-03  2.184e-03    0.644   0.5197    
## as.factor(weekday)^4 -2.889e-02  2.188e-03  -13.204  < 2e-16 ***
## as.factor(weekday)^5 -1.732e-02  2.181e-03   -7.943 1.98e-15 ***
## as.factor(weekday)^6 -5.050e-03  2.184e-03   -2.312   0.0208 *  
## wd_secQ2             -1.049e-01  2.952e-03  -35.543  < 2e-16 ***
## wd_secQ3             -1.878e-01  2.958e-03  -63.463  < 2e-16 ***
## wd_secQ4             -1.651e-01  2.796e-03  -59.057  < 2e-16 ***
## ws                   -4.613e-02  3.094e-04 -149.116  < 2e-16 ***
## downwind_ref          3.540e-02  2.614e-03   13.545  < 2e-16 ***
## downwind_wrp          2.151e-02  3.270e-03    6.577 4.80e-11 ***
## I(1/MinDist^2)       -4.864e-05  8.792e-06   -5.533 3.15e-08 ***
## I(1/dist_wrp^2)       1.483e-06  9.456e-07    1.568   0.1168    
## capacity              5.414e-03  5.868e-04    9.225  < 2e-16 ***
## wrp_angle             3.176e-03  7.460e-04    4.258 2.06e-05 ***
## Converted_Angle      -1.473e-02  1.020e-03  -14.451  < 2e-16 ***
## elevation            -1.200e-01  9.824e-03  -12.215  < 2e-16 ***
## EVI                   2.459e+00  4.676e-01    5.259 1.44e-07 ***
## monthly_oil_1km      -2.575e-05  2.232e-06  -11.534  < 2e-16 ***
## monthly_gas_1km      -1.831e-04  9.137e-06  -20.036  < 2e-16 ***
## active_1km           -3.881e-02  9.430e-04  -41.155  < 2e-16 ***
## num_odor_complaints   2.632e-02  1.466e-03   17.949  < 2e-16 ***
## I(1/dist_dc^2)        2.766e-03  3.971e-03    0.696   0.4861    
## avg_temp              6.183e-03  2.696e-04   22.937  < 2e-16 ***
## avg_hum              -6.301e-03  7.620e-05  -82.688  < 2e-16 ***
## precip               -7.811e-02  5.252e-03  -14.872  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(as.numeric(month))                                     7.999  8.000 2547.0
## s(mon_utm_x,mon_utm_y)                                   1.682  1.683  246.9
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 85.198 85.296 1455.0
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(mon_utm_x,mon_utm_y)                                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 129/137
## R-sq.(adj) =  0.166   Deviance explained = 16.6%
## GCV = 1.1485  Scale est. = 1.1484    n = 1696814
plot(h2s_model_g)

gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   4600310  245.7    9110970  486.6   6745969  360.3
## Vcells 274357720 2093.2  427665339 3262.9 427590071 3262.3

Daily max H2S

# With month, year, wind sector/speed, dist to refinery, refinery
h2s_dm_model_a <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+year+wd_avg+ws_avg+
                        I(1/MinDist^2), 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + wd_avg + 
##     ws_avg + I(1/MinDist^2)
## 
## Parametric coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3.020e+00  3.464e-01   8.718   <2e-16 ***
## year2023       -4.317e-01  2.195e-01  -1.967   0.0493 *  
## wd_avg          6.905e-04  1.032e-03   0.669   0.5034    
## ws_avg         -8.775e-02  6.334e-02  -1.385   0.1661    
## I(1/MinDist^2)  7.628e-07  8.197e-08   9.306   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df     F  p-value    
## s(as.numeric(month)) 2.185      8 2.019 6.71e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 12/13
## R-sq.(adj) =  0.0087   Deviance explained = 1.06%
## GCV = 21.435  Scale est. = 21.386    n = 2667
plot(h2s_dm_model_a)

# Also with location of monitor
h2s_dm_model_b <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+wd_avg+ws_avg+
                        I(1/MinDist^2)+Refinery+s(monitor_lat, monitor_lon, bs='tp', k=10), 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_b)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + wd_avg + ws_avg + 
##     I(1/MinDist^2) + Refinery + s(monitor_lat, monitor_lon, bs = "tp", 
##     k = 10)
## 
## Parametric coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       9.409e+00  1.421e+01   0.662  0.50803    
## wd_avg                            3.025e-03  1.007e-03   3.003  0.00269 ** 
## ws_avg                           -3.111e-01  6.322e-02  -4.921 9.14e-07 ***
## I(1/MinDist^2)                   -6.196e-05  1.642e-03  -0.038  0.96991    
## RefineryMarathon (Carson)        -8.957e-01  1.763e+01  -0.051  0.95948    
## RefineryMarathon (Wilmington)    -3.725e+00  1.747e+01  -0.213  0.83120    
## RefineryPhillips 66 (Wilmington) -1.409e+01  1.628e+01  -0.865  0.38708    
## RefineryTorrance Refinery        -2.584e+00  1.402e+01  -0.184  0.85377    
## RefineryValero Refinery          -7.558e+00  1.768e+01  -0.428  0.66901    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                              edf Ref.df     F  p-value    
## s(as.numeric(month))       2.036  8.000 1.725 0.000201 ***
## s(monitor_lat,monitor_lon) 5.380  5.673 9.290 8.68e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 25/26
## R-sq.(adj) =  0.127   Deviance explained = 13.2%
## GCV = 18.946  Scale est. = 18.836    n = 2667
plot(h2s_dm_model_b)

# Also include angle to refinery
h2s_dm_model_c <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+wd_avg+ws_avg+
                        I(1/MinDist^2)+Converted_Angle+
                        s(monitor_lat, monitor_lon, bs='tp', k=10), 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_c)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + wd_avg + ws_avg + 
##     I(1/MinDist^2) + Converted_Angle + s(monitor_lat, monitor_lon, 
##     bs = "tp", k = 10)
## 
## Parametric coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      3.607e+00  8.814e-01   4.092  4.4e-05 ***
## wd_avg           2.917e-03  1.011e-03   2.884  0.00395 ** 
## ws_avg          -2.238e-01  6.195e-02  -3.612  0.00031 ***
## I(1/MinDist^2)  -8.262e-06  3.830e-05  -0.216  0.82923    
## Converted_Angle -3.430e-03  3.891e-03  -0.882  0.37805    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                              edf Ref.df      F  p-value    
## s(as.numeric(month))       2.114  8.000  2.046 4.81e-05 ***
## s(monitor_lat,monitor_lon) 7.081  7.926 41.174  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 21/22
## R-sq.(adj) =  0.117   Deviance explained = 12.1%
## GCV = 19.148  Scale est. = 19.053    n = 2667
plot(h2s_dm_model_c)

h2s_dm_model_d <- gam(H2S_daily_max ~ 
                         s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + 
                         I(1/MinDist^2) + dist_wrp + capacity +
                         Converted_Angle + 
                         s(monitor_lat, monitor_lon, bs='tp', k = 10) + 
                         monthly_oil_1km + monthly_gas_1km + active_1km
                      , 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_d)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/MinDist^2) + dist_wrp + 
##     capacity + Converted_Angle + s(monitor_lat, monitor_lon, 
##     bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.136e+01  4.634e+00  -2.452 0.014273 *  
## year2023             -4.471e-01  4.297e-01  -1.040 0.298239    
## as.factor(weekday).L  4.596e-01  2.417e-01   1.901 0.057365 .  
## as.factor(weekday).Q -9.234e-01  2.421e-01  -3.814 0.000140 ***
## as.factor(weekday).C -8.155e-03  2.404e-01  -0.034 0.972936    
## as.factor(weekday)^4 -1.166e-01  2.399e-01  -0.486 0.627170    
## as.factor(weekday)^5 -3.736e-01  2.388e-01  -1.564 0.117844    
## as.factor(weekday)^6 -1.931e-01  2.394e-01  -0.807 0.419866    
## wd_avg                3.799e-03  1.098e-03   3.459 0.000551 ***
## ws_avg               -3.497e-01  6.974e-02  -5.014 5.73e-07 ***
## daily_downwind_ref    7.552e-01  4.163e-01   1.814 0.069803 .  
## I(1/MinDist^2)        1.152e-04  4.177e-04   0.276 0.782730    
## dist_wrp              2.158e-03  7.129e-04   3.027 0.002500 ** 
## capacity              9.067e-03  1.252e-02   0.724 0.469156    
## Converted_Angle       6.465e-03  9.437e-03   0.685 0.493379    
## monthly_oil_1km      -5.757e-05  1.609e-04  -0.358 0.720447    
## monthly_gas_1km      -2.126e-04  6.315e-04  -0.337 0.736385    
## active_1km            4.756e-03  4.832e-02   0.098 0.921606    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                              edf Ref.df      F p-value    
## s(as.numeric(month))       1.721  8.000  0.912 0.00637 ** 
## s(monitor_lat,monitor_lon) 7.785  7.976 13.083 < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 34/35
## R-sq.(adj) =  0.137   Deviance explained = 14.6%
## GCV = 20.162  Scale est. = 19.942    n = 2431
plot(h2s_dm_model_d)

# Try to include as many variables as possible
h2s_dm_model_e <- gam(H2S_daily_max ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                        capacity + daily_downwind_wrp + 
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
                           k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        elevation + EVI + num_odor_complaints + 
                        I(1/dist_dc^2) + avg_temp + avg_hum + precip, 
                      data = daily_full %>% filter(day >= '2022-02-01'))

summary(h2s_dm_model_e)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) + 
##     capacity + daily_downwind_wrp + s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp", 
##         "cc")) + monthly_oil_1km + monthly_gas_1km + active_1km + 
##     elevation + EVI + num_odor_complaints + I(1/dist_dc^2) + 
##     avg_temp + avg_hum + precip
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               8.916e+00  3.078e+00   2.896 0.003811 ** 
## year2023                 -4.825e-01  4.432e-01  -1.089 0.276410    
## as.character(weekday)Mon -7.024e-01  3.365e-01  -2.088 0.036932 *  
## as.character(weekday)Sat -7.705e-01  3.426e-01  -2.249 0.024595 *  
## as.character(weekday)Sun -1.300e+00  3.422e-01  -3.800 0.000149 ***
## as.character(weekday)Thu -3.722e-01  3.406e-01  -1.093 0.274635    
## as.character(weekday)Tue -1.518e-01  3.350e-01  -0.453 0.650577    
## as.character(weekday)Wed -4.269e-02  3.405e-01  -0.125 0.900238    
## wd_avg                    2.712e-03  1.129e-03   2.401 0.016405 *  
## ws_avg                   -2.786e-01  7.195e-02  -3.872 0.000111 ***
## daily_downwind_ref        5.252e-01  4.120e-01   1.275 0.202453    
## I(1/dist_wrp^2)          -1.689e-06  3.125e-07  -5.403 7.22e-08 ***
## capacity                 -3.644e-03  3.724e-03  -0.978 0.327933    
## daily_downwind_wrp        1.881e-01  4.335e-01   0.434 0.664394    
## monthly_oil_1km          -7.268e-05  1.628e-04  -0.446 0.655356    
## monthly_gas_1km          -3.721e-04  6.523e-04  -0.571 0.568376    
## active_1km                1.370e-02  5.147e-02   0.266 0.790046    
## elevation                -1.032e-01  6.119e-02  -1.686 0.091878 .  
## EVI                      -3.506e+00  8.125e-01  -4.315 1.66e-05 ***
## num_odor_complaints       1.538e-01  1.550e-01   0.992 0.321095    
## I(1/dist_dc^2)           -7.080e-05  1.720e-05  -4.117 3.97e-05 ***
## avg_temp                  1.721e-02  2.313e-02   0.744 0.456876    
## avg_hum                  -2.319e-02  6.853e-03  -3.384 0.000726 ***
## precip                    4.843e-01  4.540e-01   1.067 0.286252    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                           edf Ref.df     F
## s(as.numeric(month))                                    1.047   8.00 0.236
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                  6.913   7.57 8.877
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 3.511  77.00 0.103
##                                                         p-value    
## s(as.numeric(month))                                    0.09437 .  
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                  < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 0.00963 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 116/118
## R-sq.(adj) =  0.144   Deviance explained = 15.5%
## GCV = 20.074  Scale est. = 19.797    n = 2431
plot(h2s_dm_model_e)

e <- getViz(h2s_dm_model_e)
plot(sm(e, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_dm_model_f <- gam(H2S_daily_max ~ as.character(weekday) +
                        wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                        capacity + daily_downwind_wrp + 
                        s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
                           k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        elevation + EVI + num_odor_complaints +
                        I(1/dist_dc^2) + avg_temp + avg_hum + precip, 
                      data = daily_full %>% filter(year == '2021', month %in% c('10', '11', '12')))

summary(h2s_dm_model_f)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ as.character(weekday) + wd_avg + ws_avg + daily_downwind_ref + 
##     I(1/dist_wrp^2) + capacity + daily_downwind_wrp + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km + elevation + EVI + num_odor_complaints + I(1/dist_dc^2) + 
##     avg_temp + avg_hum + precip
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -3.502e+00  1.301e+00  -2.693  0.00725 ** 
## as.character(weekday)Mon  3.939e+00  2.581e+01   0.153  0.87873    
## as.character(weekday)Sat -1.275e+00  2.497e+01  -0.051  0.95930    
## as.character(weekday)Sun  6.518e+00  2.516e+01   0.259  0.79561    
## as.character(weekday)Thu  1.072e+01  2.548e+01   0.421  0.67406    
## as.character(weekday)Tue -1.204e+01  2.627e+01  -0.458  0.64683    
## as.character(weekday)Wed  9.587e+00  2.591e+01   0.370  0.71144    
## wd_avg                    7.981e-02  8.258e-02   0.966  0.33411    
## ws_avg                    1.029e-01  6.617e+00   0.016  0.98760    
## daily_downwind_ref       -3.077e+01  2.440e+01  -1.261  0.20754    
## I(1/dist_wrp^2)          -1.388e-03  1.470e-04  -9.442  < 2e-16 ***
## capacity                  6.856e+00  7.105e-01   9.650  < 2e-16 ***
## daily_downwind_wrp        8.300e+00  2.955e+01   0.281  0.77891    
## monthly_oil_1km           6.095e-02  1.223e-01   0.498  0.61833    
## monthly_gas_1km           3.919e-03  3.281e-01   0.012  0.99047    
## active_1km               -1.056e+02  3.923e+01  -2.693  0.00725 ** 
## elevation                 2.452e+01  7.897e+00   3.104  0.00198 ** 
## EVI                       8.040e+02  1.046e+02   7.689 4.69e-14 ***
## num_odor_complaints       3.663e+00  8.699e-01   4.211 2.85e-05 ***
## I(1/dist_dc^2)            9.637e+00  9.958e-01   9.678  < 2e-16 ***
## avg_temp                  2.544e+00  2.454e+00   1.037  0.30029    
## avg_hum                   2.180e-01  5.860e-01   0.372  0.71001    
## precip                   -6.490e+00  2.637e+01  -0.246  0.80566    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.929   8.99 12.18
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 52.052  80.00  4.36
##                                                         p-value    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 109/112
## R-sq.(adj) =  0.511   Deviance explained = 55.8%
## GCV =  37085  Scale est. = 33454     n = 827
plot(h2s_dm_model_f)

f <- getViz(h2s_dm_model_f)
plot(sm(f, 1)) + l_fitRaster() + l_fitContour() + l_points()

h2s_dm_model_g <- gam(H2S_daily_max ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                        wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                        capacity + daily_downwind_wrp +
                        s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
                           k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                        monthly_oil_1km + monthly_gas_1km + active_1km + 
                        elevation + EVI + num_odor_complaints +
                        I(1/dist_dc^2) + avg_temp + avg_hum + precip, 
                      data = daily_full %>% filter(!(year == '2021' & month %in% c('10', '11', '12'))))

summary(h2s_dm_model_g)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) + 
##     capacity + daily_downwind_wrp + s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp", 
##         "cc")) + monthly_oil_1km + monthly_gas_1km + active_1km + 
##     elevation + EVI + num_odor_complaints + I(1/dist_dc^2) + 
##     avg_temp + avg_hum + precip
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               9.549e+00  4.812e+00   1.985 0.047239 *  
## year2021                 -3.312e-01  6.114e-01  -0.542 0.588082    
## year2022                 -2.741e-01  8.979e-01  -0.305 0.760155    
## year2023                 -4.830e-01  1.051e+00  -0.459 0.645947    
## as.character(weekday)Mon -3.089e-01  2.867e-01  -1.078 0.281286    
## as.character(weekday)Sat -5.170e-01  2.867e-01  -1.803 0.071381 .  
## as.character(weekday)Sun -8.541e-01  2.872e-01  -2.974 0.002948 ** 
## as.character(weekday)Thu -5.403e-02  2.869e-01  -0.188 0.850630    
## as.character(weekday)Tue  2.851e-01  2.853e-01   0.999 0.317608    
## as.character(weekday)Wed  6.704e-02  2.872e-01   0.233 0.815450    
## wd_avg                    1.637e-03  1.043e-03   1.570 0.116515    
## ws_avg                   -2.778e-01  6.387e-02  -4.349 1.39e-05 ***
## daily_downwind_ref        1.678e-01  2.823e-01   0.595 0.552119    
## I(1/dist_wrp^2)          -2.894e-06  5.987e-07  -4.833 1.38e-06 ***
## capacity                  3.271e-04  7.242e-03   0.045 0.963973    
## daily_downwind_wrp       -6.160e-02  3.434e-01  -0.179 0.857664    
## monthly_oil_1km          -1.339e-06  1.627e-04  -0.008 0.993434    
## monthly_gas_1km           2.002e-04  6.188e-04   0.324 0.746321    
## active_1km                2.663e-02  5.038e-02   0.529 0.597102    
## elevation                -2.393e-01  7.356e-02  -3.253 0.001147 ** 
## EVI                      -4.285e+00  1.114e+00  -3.846 0.000121 ***
## num_odor_complaints       3.875e-01  1.302e-01   2.975 0.002939 ** 
## I(1/dist_dc^2)            2.339e-04  2.144e-03   0.109 0.913132    
## avg_temp                 -6.570e-03  2.333e-02  -0.282 0.778232    
## avg_hum                  -2.571e-02  6.760e-03  -3.804 0.000144 ***
## precip                    2.603e-01  5.213e-01   0.499 0.617515    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(as.numeric(month))                                     5.201  8.000 1.089
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   7.851  8.085 6.333
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 15.404 80.000 0.458
##                                                          p-value    
## s(as.numeric(month))                                    0.082687 .  
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 0.000162 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 121/123
## R-sq.(adj) =  0.0639   Deviance explained = 7.21%
## GCV =  34.93  Scale est. = 34.621    n = 5929
plot(h2s_dm_model_g)

g <- getViz(h2s_dm_model_g)
plot(sm(g, 2)) + l_fitRaster() + l_fitContour() + l_points()

Daily Average

# Look at daily average
h2s_da_model_a <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + 
                         I(1/MinDist^2) + I(1/dist_wrp^2) + capacity +
                         Converted_Angle + 
                         s(monitor_lon, monitor_lat, bs='tp', k = 10) + 
                         monthly_oil_1km + monthly_gas_1km + active_1km, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/MinDist^2) + I(1/dist_wrp^2) + 
##     capacity + Converted_Angle + s(monitor_lon, monitor_lat, 
##     bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -5.241e+00  6.271e-01  -8.357  < 2e-16 ***
## year2023             -1.050e-01  5.903e-02  -1.779  0.07532 .  
## as.factor(weekday).L  7.927e-02  2.550e-02   3.109  0.00190 ** 
## as.factor(weekday).Q -1.710e-01  2.550e-02  -6.705 2.49e-11 ***
## as.factor(weekday).C  2.682e-02  2.531e-02   1.060  0.28935    
## as.factor(weekday)^4  9.464e-04  2.526e-02   0.037  0.97011    
## as.factor(weekday)^5 -5.869e-02  2.514e-02  -2.334  0.01967 *  
## as.factor(weekday)^6 -4.824e-02  2.519e-02  -1.915  0.05563 .  
## wd_avg                8.074e-04  1.160e-04   6.959 4.42e-12 ***
## ws_avg               -1.194e-01  7.598e-03 -15.710  < 2e-16 ***
## daily_downwind_ref    2.721e-01  4.395e-02   6.190 7.05e-10 ***
## I(1/MinDist^2)        3.246e-04  3.119e-05  10.409  < 2e-16 ***
## I(1/dist_wrp^2)      -1.728e-05  2.306e-06  -7.495 9.24e-14 ***
## capacity              1.668e-02  1.211e-03  13.774  < 2e-16 ***
## Converted_Angle      -2.715e-03  1.050e-03  -2.585  0.00979 ** 
## monthly_oil_1km       5.069e-05  2.596e-05   1.952  0.05103 .  
## monthly_gas_1km       2.193e-04  1.024e-04   2.141  0.03236 *  
## active_1km           -2.573e-02  1.059e-02  -2.429  0.01523 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                              edf Ref.df     F p-value    
## s(as.numeric(month))       7.531      8 10.26  <2e-16 ***
## s(monitor_lon,monitor_lat) 8.987      9 73.16  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 33/35
## R-sq.(adj) =  0.395   Deviance explained = 40.3%
## GCV = 0.22365  Scale est. = 0.22066   n = 2431
plot(h2s_da_model_a)

a <- getViz(h2s_da_model_a)
plot(sm(a, 2)) + l_fitRaster() + l_fitContour() + l_points()

# Look at daily average
h2s_da_model_b <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + 
                         I(1/MinDist^2) + I(1/dist_wrp^2) + capacity +
                         Converted_Angle + 
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         monthly_oil_1km + monthly_gas_1km + active_1km, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_b)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/MinDist^2) + I(1/dist_wrp^2) + 
##     capacity + Converted_Angle + s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -3.658e+00  5.258e-01  -6.958 4.44e-12 ***
## year2023             -1.051e-01  5.903e-02  -1.780   0.0752 .  
## as.factor(weekday).L  7.927e-02  2.550e-02   3.109   0.0019 ** 
## as.factor(weekday).Q -1.710e-01  2.550e-02  -6.705 2.50e-11 ***
## as.factor(weekday).C  2.682e-02  2.531e-02   1.060   0.2893    
## as.factor(weekday)^4  9.340e-04  2.526e-02   0.037   0.9705    
## as.factor(weekday)^5 -5.868e-02  2.514e-02  -2.334   0.0197 *  
## as.factor(weekday)^6 -4.823e-02  2.519e-02  -1.914   0.0557 .  
## wd_avg                8.074e-04  1.160e-04   6.958 4.43e-12 ***
## ws_avg               -1.193e-01  7.597e-03 -15.704  < 2e-16 ***
## daily_downwind_ref    2.721e-01  4.395e-02   6.191 7.02e-10 ***
## I(1/MinDist^2)        1.464e-04  1.550e-05   9.449  < 2e-16 ***
## I(1/dist_wrp^2)      -1.097e-05  1.110e-06  -9.881  < 2e-16 ***
## capacity              1.272e-02  9.049e-04  14.062  < 2e-16 ***
## Converted_Angle      -2.584e-03  1.032e-03  -2.505   0.0123 *  
## monthly_oil_1km       5.069e-05  2.596e-05   1.952   0.0510 .  
## monthly_gas_1km       2.194e-04  1.024e-04   2.142   0.0323 *  
## active_1km           -2.574e-02  1.059e-02  -2.430   0.0152 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                          edf Ref.df     F p-value    
## s(as.numeric(month))                   7.531      8 10.26  <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.980      9 73.01  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 33/35
## R-sq.(adj) =  0.395   Deviance explained = 40.3%
## GCV = 0.22365  Scale est. = 0.22066   n = 2431
plot(h2s_da_model_b)

b <- getViz(h2s_da_model_b)
plot(sm(b, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_da_model_c <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + capacity + 
                          I(1/dist_wrp^2) +
                          
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         monthly_oil_1km + monthly_gas_1km + active_1km, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_c)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     monthly_oil_1km + monthly_gas_1km + active_1km
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -4.449e+00  4.264e-01 -10.432  < 2e-16 ***
## year2023             -1.035e-01  5.908e-02  -1.752  0.07997 .  
## as.factor(weekday).L  7.916e-02  2.553e-02   3.101  0.00195 ** 
## as.factor(weekday).Q -1.711e-01  2.553e-02  -6.700 2.58e-11 ***
## as.factor(weekday).C  2.689e-02  2.534e-02   1.061  0.28862    
## as.factor(weekday)^4  1.225e-03  2.528e-02   0.048  0.96137    
## as.factor(weekday)^5 -5.902e-02  2.517e-02  -2.345  0.01910 *  
## as.factor(weekday)^6 -4.850e-02  2.522e-02  -1.923  0.05457 .  
## wd_avg                8.020e-04  1.161e-04   6.906 6.35e-12 ***
## ws_avg               -1.206e-01  7.591e-03 -15.883  < 2e-16 ***
## daily_downwind_ref    2.709e-01  4.400e-02   6.156 8.70e-10 ***
## capacity              1.334e-02  8.779e-04  15.194  < 2e-16 ***
## I(1/dist_wrp^2)      -1.345e-05  1.080e-06 -12.454  < 2e-16 ***
## monthly_oil_1km       5.039e-05  2.598e-05   1.939  0.05257 .  
## monthly_gas_1km       2.202e-04  1.025e-04   2.148  0.03182 *  
## active_1km           -2.551e-02  1.060e-02  -2.406  0.01620 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                          edf Ref.df     F p-value    
## s(as.numeric(month))                   7.528      8 10.16  <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.991      9 78.58  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 32/33
## R-sq.(adj) =  0.394   Deviance explained = 40.1%
## GCV = 0.22402  Scale est. = 0.22112   n = 2431
plot(h2s_da_model_c)

c <- getViz(h2s_da_model_c)
plot(sm(c, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_da_model_d <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + capacity + 
                          I(1/dist_wrp^2) +
                          
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_d)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     monthly_oil_1km + monthly_gas_1km + active_1km + daily_downwind_wrp + 
##     elevation + EVI
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               1.112e+00  7.526e-01   1.478  0.13948    
## year2023                 -1.057e-01  5.901e-02  -1.792  0.07328 .  
## as.character(weekday)Mon -8.975e-02  3.557e-02  -2.523  0.01169 *  
## as.character(weekday)Sat -9.870e-02  3.615e-02  -2.730  0.00637 ** 
## as.character(weekday)Sun -1.985e-01  3.594e-02  -5.522 3.72e-08 ***
## as.character(weekday)Thu -5.020e-02  3.597e-02  -1.396  0.16289    
## as.character(weekday)Tue  6.448e-03  3.539e-02   0.182  0.85543    
## as.character(weekday)Wed  5.342e-02  3.590e-02   1.488  0.13694    
## wd_avg                    8.069e-04  1.160e-04   6.955 4.54e-12 ***
## ws_avg                   -1.197e-01  7.601e-03 -15.753  < 2e-16 ***
## daily_downwind_ref        2.736e-01  4.391e-02   6.231 5.45e-10 ***
## capacity                  9.779e-04  1.519e-03   0.644  0.51975    
## I(1/dist_wrp^2)          -1.370e-07  5.649e-08  -2.426  0.01536 *  
## monthly_oil_1km           5.052e-05  2.595e-05   1.947  0.05166 .  
## monthly_gas_1km           2.217e-04  1.024e-04   2.165  0.03050 *  
## active_1km               -2.590e-02  1.059e-02  -2.446  0.01451 *  
## daily_downwind_wrp        5.198e-02  4.576e-02   1.136  0.25617    
## elevation                -1.643e-02  8.572e-03  -1.917  0.05540 .  
## EVI                      -1.137e+00  1.537e-01  -7.399 1.88e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                          edf Ref.df     F p-value    
## s(as.numeric(month))                   7.525  8.000 10.29  <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 7.917  7.997 43.19  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 35/36
## R-sq.(adj) =  0.395   Deviance explained = 40.3%
## GCV = 0.22371  Scale est. = 0.22063   n = 2431
plot(h2s_da_model_d)

d <- getViz(h2s_da_model_d)
plot(sm(d, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_da_model_e <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + capacity + 
                          I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_e)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + daily_downwind_wrp + elevation + 
##     EVI
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -1.340e+00  3.648e-01  -3.673 0.000245 ***
## year2023                  2.022e-01  1.393e-01   1.451 0.146918    
## as.character(weekday)Mon -8.888e-02  2.901e-02  -3.063 0.002216 ** 
## as.character(weekday)Sat -9.492e-02  2.944e-02  -3.224 0.001283 ** 
## as.character(weekday)Sun -1.983e-01  2.931e-02  -6.767 1.66e-11 ***
## as.character(weekday)Thu -4.763e-02  2.934e-02  -1.623 0.104656    
## as.character(weekday)Tue -2.590e-04  2.890e-02  -0.009 0.992851    
## as.character(weekday)Wed  4.657e-02  2.927e-02   1.591 0.111728    
## wd_avg                    6.755e-04  9.969e-05   6.777 1.55e-11 ***
## ws_avg                   -1.099e-01  6.457e-03 -17.027  < 2e-16 ***
## daily_downwind_ref        2.112e-01  3.650e-02   5.786 8.17e-09 ***
## capacity                  6.672e-03  7.272e-04   9.174  < 2e-16 ***
## I(1/dist_wrp^2)           2.738e-07  1.034e-07   2.648 0.008160 ** 
## monthly_oil_1km           6.383e-05  4.342e-05   1.470 0.141732    
## monthly_gas_1km          -3.474e-05  2.298e-04  -0.151 0.879834    
## active_1km               -6.203e-03  1.696e-02  -0.366 0.714536    
## daily_downwind_wrp        5.572e-02  3.797e-02   1.467 0.142417    
## elevation                -4.655e-02  1.040e-02  -4.475 8.00e-06 ***
## EVI                      -1.023e+00  1.397e-01  -7.319 3.43e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(as.numeric(month))                                     7.446  8.000  3.826
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.471  8.471 24.499
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 75.663 77.000 16.791
##                                                          p-value    
## s(as.numeric(month))                                    7.44e-05 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 111/113
## R-sq.(adj) =    0.6   Deviance explained = 61.8%
## GCV = 0.15284  Scale est. = 0.14598   n = 2431
plot(h2s_da_model_e)

e <- getViz(h2s_da_model_e)
plot(sm(e, 2)) + l_fitRaster() + l_fitContour() + l_points()

# plotRGL(sm(d, 3), fix = c(`as.numeric(day)` = 0), residuals = F)
# 
# # try a plot using wireframe
# plot3d <- matrix(fitted(h2s_da_model_e), ncol = 20)
# wireframe(plot3d, drape=TRUE, colorkey=TRUE)
# Since feb 2022
h2s_da_model_f <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + capacity +
                          I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
                           k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                        I(1/dist_dc^2) + avg_temp + avg_hum + precip, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_f)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + daily_downwind_wrp + elevation + 
##     EVI + num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum + 
##     precip
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -1.444e+00  3.800e-01  -3.799 0.000149 ***
## year2023                  1.574e-01  1.394e-01   1.129 0.259160    
## as.character(weekday)Mon -8.921e-02  2.785e-02  -3.203 0.001376 ** 
## as.character(weekday)Sat -9.195e-02  2.820e-02  -3.261 0.001128 ** 
## as.character(weekday)Sun -2.158e-01  2.825e-02  -7.638 3.20e-14 ***
## as.character(weekday)Thu -4.794e-02  2.810e-02  -1.706 0.088066 .  
## as.character(weekday)Tue -2.705e-03  2.770e-02  -0.098 0.922221    
## as.character(weekday)Wed  3.146e-02  2.809e-02   1.120 0.262844    
## wd_avg                    3.607e-04  9.868e-05   3.655 0.000263 ***
## ws_avg                   -8.212e-02  6.477e-03 -12.679  < 2e-16 ***
## daily_downwind_ref        1.567e-01  3.519e-02   4.454 8.84e-06 ***
## capacity                  6.189e-03  7.279e-04   8.503  < 2e-16 ***
## I(1/dist_wrp^2)           2.902e-07  9.986e-08   2.907 0.003687 ** 
## monthly_oil_1km           1.470e-04  4.418e-05   3.327 0.000891 ***
## monthly_gas_1km          -3.235e-04  2.245e-04  -1.441 0.149619    
## active_1km               -2.002e-02  1.695e-02  -1.181 0.237681    
## daily_downwind_wrp        7.073e-02  3.636e-02   1.945 0.051897 .  
## elevation                -4.407e-02  9.975e-03  -4.418 1.04e-05 ***
## EVI                      -9.561e-01  1.355e-01  -7.059 2.21e-12 ***
## num_odor_complaints       2.644e-02  1.318e-02   2.006 0.044921 *  
## I(1/dist_dc^2)            9.134e-05  8.252e-06  11.069  < 2e-16 ***
## avg_temp                  8.334e-03  2.861e-03   2.913 0.003618 ** 
## avg_hum                  -6.523e-03  7.504e-04  -8.694  < 2e-16 ***
## precip                   -5.642e-02  3.864e-02  -1.460 0.144353    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(as.numeric(month))                                     7.719  8.000  4.319
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.471  8.471 24.628
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 75.673 77.000 18.676
##                                                          p-value    
## s(as.numeric(month))                                    2.29e-05 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 115/118
## R-sq.(adj) =  0.634   Deviance explained = 65.1%
## GCV = 0.14014  Scale est. = 0.1336    n = 2431
plot(h2s_da_model_f)

f <- getViz(h2s_da_model_f)
plot(sm(e, 2)) + l_fitRaster() + l_fitContour() + l_points()

# plotRGL(sm(d, 3), fix = c(`as.numeric(day)` = 0), residuals = F)
# 
# # try a plot using wireframe
# plot3d <- matrix(fitted(h2s_da_model_e), ncol = 20)
# wireframe(plot3d, drape=TRUE, colorkey=TRUE)
# Disaster only
h2s_da_model_g <- gam(H2S_daily_avg ~ month + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + capacity +
                          I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
                           k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                        I(1/dist_dc^2) + avg_temp + avg_hum + precip, 
                      data = daily_full %>% filter(year == '2021', month %in% c('10', '11', '12')))
summary(h2s_da_model_g)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ month + as.character(weekday) + wd_avg + ws_avg + 
##     daily_downwind_ref + capacity + I(1/dist_wrp^2) + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km + daily_downwind_wrp + elevation + EVI + num_odor_complaints + 
##     I(1/dist_dc^2) + avg_temp + avg_hum + precip
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.2557355  0.1054716  -2.425  0.01556 *  
## month11                  -4.7575720  1.9622093  -2.425  0.01556 *  
## month12                   1.2193191  0.5029553   2.424  0.01557 *  
## as.character(weekday)Mon -2.7259172  2.9255211  -0.932  0.35176    
## as.character(weekday)Sat -2.8891542  2.8298781  -1.021  0.30761    
## as.character(weekday)Sun -2.4956249  2.8514127  -0.875  0.38173    
## as.character(weekday)Thu -0.3339398  2.8872365  -0.116  0.90795    
## as.character(weekday)Tue -4.5461381  2.9772550  -1.527  0.12720    
## as.character(weekday)Wed -1.2618609  2.9362292  -0.430  0.66750    
## wd_avg                    0.0080694  0.0093628   0.862  0.38904    
## ws_avg                   -0.0079586  0.7510760  -0.011  0.99155    
## daily_downwind_ref       -2.7311456  2.7658732  -0.987  0.32375    
## capacity                  0.7818620  0.0804573   9.718  < 2e-16 ***
## I(1/dist_wrp^2)          -0.0001603  0.0000167  -9.600  < 2e-16 ***
## monthly_oil_1km          -0.0069716  0.0099354  -0.702  0.48309    
## monthly_gas_1km           0.0191159  0.0350103   0.546  0.58522    
## active_1km               -7.7141096  3.1814972  -2.425  0.01556 *  
## daily_downwind_wrp       -0.8402903  3.3511018  -0.251  0.80208    
## elevation                 2.7201925  0.8943859   3.041  0.00244 ** 
## EVI                      90.9526595 11.8445967   7.679 5.05e-14 ***
## num_odor_complaints       0.5226646  0.0988284   5.289 1.62e-07 ***
## I(1/dist_dc^2)            1.1056957  0.1127784   9.804  < 2e-16 ***
## avg_temp                  0.3827439  0.2785720   1.374  0.16987    
## avg_hum                   0.0272100  0.0665065   0.409  0.68256    
## precip                   -2.5913331  2.9921008  -0.866  0.38674    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.926  8.989 12.392
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 52.823 80.000  3.456
##                                                         p-value    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 109/114
## R-sq.(adj) =  0.501   Deviance explained =   55%
## GCV = 476.76  Scale est. = 429.63    n = 827
plot(h2s_da_model_g)

g <- getViz(h2s_da_model_g)
plot(sm(g, 1)) + l_fitRaster() + l_fitContour() + l_points()

# Exclude disaster
h2s_da_model_h <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + capacity +
                          I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
                           k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                        I(1/dist_dc^2) + avg_temp + avg_hum + precip, 
                      data = daily_full %>% filter(!(year == '2021' & month %in% c('10', '11', '12'))))
summary(h2s_da_model_h)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + daily_downwind_wrp + elevation + 
##     EVI + num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum + 
##     precip
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -5.211e+00  1.292e+00  -4.034 5.56e-05 ***
## year2021                  3.980e-01  1.063e-01   3.745 0.000182 ***
## year2022                 -5.043e-01  1.975e-01  -2.553 0.010690 *  
## year2023                 -5.863e-01  1.951e-01  -3.006 0.002662 ** 
## as.character(weekday)Mon -5.663e-02  2.403e-02  -2.356 0.018491 *  
## as.character(weekday)Sat -8.496e-02  2.400e-02  -3.539 0.000404 ***
## as.character(weekday)Sun -1.725e-01  2.406e-02  -7.170 8.41e-13 ***
## as.character(weekday)Thu -1.198e-02  2.402e-02  -0.499 0.617931    
## as.character(weekday)Tue -1.297e-02  2.391e-02  -0.542 0.587578    
## as.character(weekday)Wed  1.436e-02  2.407e-02   0.596 0.550993    
## wd_avg                    1.596e-04  8.894e-05   1.795 0.072754 .  
## ws_avg                   -7.186e-02  5.614e-03 -12.801  < 2e-16 ***
## daily_downwind_ref        1.371e-02  2.425e-02   0.565 0.571907    
## capacity                  1.852e-02  2.451e-03   7.557 4.77e-14 ***
## I(1/dist_wrp^2)          -7.037e-06  2.159e-06  -3.260 0.001122 ** 
## monthly_oil_1km          -2.609e-05  1.723e-05  -1.514 0.130036    
## monthly_gas_1km          -1.911e-04  6.998e-05  -2.730 0.006344 ** 
## active_1km               -3.876e-02  7.232e-03  -5.360 8.63e-08 ***
## daily_downwind_wrp        3.779e-02  2.907e-02   1.300 0.193671    
## elevation                 6.682e-02  1.179e-02   5.665 1.54e-08 ***
## EVI                       8.998e-01  2.927e-01   3.074 0.002122 ** 
## num_odor_complaints       2.776e-02  1.131e-02   2.454 0.014152 *  
## I(1/dist_dc^2)            5.424e-02  1.576e-02   3.442 0.000582 ***
## avg_temp                  5.816e-03  2.110e-03   2.757 0.005851 ** 
## avg_hum                  -6.005e-03  5.943e-04 -10.106  < 2e-16 ***
## precip                   -3.502e-02  4.400e-02  -0.796 0.426051    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(as.numeric(month))                                     7.898      8 41.37
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.000      9 26.18
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 76.137     80 23.23
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 121/123
## R-sq.(adj) =  0.454   Deviance explained = 46.5%
## GCV = 0.24724  Scale est. = 0.24236   n = 5929
plot(h2s_da_model_h)

h <- getViz(h2s_da_model_h)
plot(sm(h, 2)) + l_fitRaster() + l_fitContour() + l_points()

# Disaster indicator
h2s_da_model_i <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + capacity +
                          I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
                           k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                        I(1/dist_dc^2) + avg_temp + avg_hum + precip + disaster, 
                      data = daily_full %>% mutate(disaster = if_else(year == '2021', 
                                                                      month %in% c('10', '11', '12'), 1, 0)))
summary(h2s_da_model_i)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + daily_downwind_wrp + elevation + 
##     EVI + num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum + 
##     precip + disaster
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -3.345e+02  1.384e+01 -24.166  < 2e-16 ***
## year2021                  3.786e+00  1.773e+00   2.135  0.03276 *  
## year2022                  3.927e+00  2.212e+00   1.775  0.07589 .  
## year2023                  1.876e+00  2.543e+00   0.738  0.46077    
## as.character(weekday)Mon -3.304e-01  3.918e-01  -0.843  0.39914    
## as.character(weekday)Sat -2.755e-01  3.907e-01  -0.705  0.48077    
## as.character(weekday)Sun -3.018e-01  3.908e-01  -0.772  0.44001    
## as.character(weekday)Thu -3.277e-01  3.912e-01  -0.838  0.40225    
## as.character(weekday)Tue -6.803e-01  3.901e-01  -1.744  0.08123 .  
## as.character(weekday)Wed -3.763e-01  3.915e-01  -0.961  0.33648    
## wd_avg                    1.342e-03  1.408e-03   0.954  0.34022    
## ws_avg                   -7.416e-02  9.036e-02  -0.821  0.41185    
## daily_downwind_ref       -6.152e-02  3.919e-01  -0.157  0.87528    
## capacity                  6.528e-01  2.477e-02  26.352  < 2e-16 ***
## I(1/dist_wrp^2)          -2.505e-04  1.063e-05 -23.576  < 2e-16 ***
## monthly_oil_1km           2.888e-04  2.638e-04   1.095  0.27359    
## monthly_gas_1km           6.262e-04  1.004e-03   0.624  0.53293    
## active_1km                2.737e-02  9.507e-02   0.288  0.77341    
## daily_downwind_wrp        4.122e-01  4.694e-01   0.878  0.38000    
## elevation                 2.384e+00  1.499e-01  15.904  < 2e-16 ***
## EVI                       7.512e+01  3.050e+00  24.632  < 2e-16 ***
## num_odor_complaints       1.000e+00  2.806e-02  35.645  < 2e-16 ***
## I(1/dist_dc^2)            1.932e+00  7.865e-02  24.569  < 2e-16 ***
## avg_temp                  2.012e-02  3.278e-02   0.614  0.53943    
## avg_hum                  -1.387e-04  8.989e-03  -0.015  0.98769    
## precip                   -2.459e-01  6.049e-01  -0.406  0.68440    
## disaster                  3.501e+00  1.127e+00   3.105  0.00191 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                           edf Ref.df      F
## s(as.numeric(month))                                     4.41      8  1.537
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.00      9 74.183
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 66.07     80  4.241
##                                                         p-value    
## s(as.numeric(month))                                    0.00666 ** 
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                  < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 122/124
## R-sq.(adj) =   0.32   Deviance explained =   33%
## GCV = 74.332  Scale est. = 73.182    n = 6756
plot(h2s_da_model_i)

i <- getViz(h2s_da_model_i)
plot(sm(i, 2)) + l_fitRaster() + l_fitContour() + l_points()

plotRGL(sm(i, 2), fix = c(`as.numeric(day)` = 0), residuals = F)

# try a plot using wireframe
# plot3d <- matrix(fitted(h2s_da_model_i), ncol = 20)
# wireframe(plot3d, drape=TRUE, colorkey=TRUE)
# Everything
h2s_da_model_j <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + capacity +
                          I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
                           k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                        I(1/dist_dc^2) + avg_temp + avg_hum + precip, 
                      data = daily_full)
summary(h2s_da_model_j)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + daily_downwind_wrp + elevation + 
##     EVI + num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum + 
##     precip
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -3.317e+02  1.379e+01 -24.050   <2e-16 ***
## year2021                  7.002e-01  1.447e+00   0.484   0.6285    
## year2022                  7.250e-01  1.920e+00   0.378   0.7057    
## year2023                 -7.798e-01  2.332e+00  -0.334   0.7381    
## as.character(weekday)Mon -3.531e-01  3.920e-01  -0.901   0.3677    
## as.character(weekday)Sat -2.631e-01  3.909e-01  -0.673   0.5009    
## as.character(weekday)Sun -3.038e-01  3.911e-01  -0.777   0.4373    
## as.character(weekday)Thu -3.481e-01  3.914e-01  -0.889   0.3739    
## as.character(weekday)Tue -7.030e-01  3.903e-01  -1.801   0.0717 .  
## as.character(weekday)Wed -3.883e-01  3.917e-01  -0.991   0.3217    
## wd_avg                    1.313e-03  1.408e-03   0.933   0.3510    
## ws_avg                   -6.694e-02  9.034e-02  -0.741   0.4587    
## daily_downwind_ref       -8.039e-02  3.921e-01  -0.205   0.8376    
## capacity                  6.535e-01  2.475e-02  26.404   <2e-16 ***
## I(1/dist_wrp^2)          -2.516e-04  1.055e-05 -23.849   <2e-16 ***
## monthly_oil_1km           3.635e-04  2.615e-04   1.390   0.1646    
## monthly_gas_1km           5.075e-04  9.945e-04   0.510   0.6099    
## active_1km                3.354e-02  9.390e-02   0.357   0.7210    
## daily_downwind_wrp        3.977e-01  4.697e-01   0.847   0.3972    
## elevation                 2.388e+00  1.499e-01  15.930   <2e-16 ***
## EVI                       7.526e+01  3.046e+00  24.712   <2e-16 ***
## num_odor_complaints       1.019e+00  2.746e-02  37.121   <2e-16 ***
## I(1/dist_dc^2)            1.934e+00  7.828e-02  24.705   <2e-16 ***
## avg_temp                  2.961e-02  3.264e-02   0.907   0.3643    
## avg_hum                  -1.644e-04  8.979e-03  -0.018   0.9854    
## precip                   -2.784e-01  6.048e-01  -0.460   0.6453    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                           edf Ref.df      F
## s(as.numeric(month))                                     4.10      8  1.261
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.00      9 74.816
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 65.67     80  4.096
##                                                         p-value    
## s(as.numeric(month))                                     0.0158 *  
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 121/123
## R-sq.(adj) =  0.319   Deviance explained = 32.9%
## GCV = 74.425  Scale est. = 73.293    n = 6756
plot(h2s_da_model_j)

knitr::kable(tibble(Model = c('Since Feb 2022',
                              'Disaster Only',
                              'Exclude Disaster',
                              'Disaster Indicator',
                              'Everything'),
                    'R-Sq' = c(round(summary(h2s_da_model_f)$r.sq, 2),
                                    round(summary(h2s_da_model_g)$r.sq, 2),
                                    round(summary(h2s_da_model_h)$r.sq, 2),
                                    round(summary(h2s_da_model_i)$r.sq, 2),
                                    round(summary(h2s_da_model_j)$r.sq, 2))))
Model R-Sq
Since Feb 2022 0.63
Disaster Only 0.50
Exclude Disaster 0.45
Disaster Indicator 0.32
Everything 0.32

80/20 Cross Validation on Daily Average models

result <- tibble(Model = character(),
                 '80/20 Train R-Sq' = numeric(),
                 '80/20 Test R-Sq' = numeric())

predictors <- c('H2S_daily_avg', 'month', 'year', 'weekday', 'wd_avg', 'ws_avg', 'daily_downwind_ref', 'dist_wrp', 
                'mon_utm_x', 'mon_utm_y', 'day', 'monthly_oil_1km', 'monthly_gas_1km', 
                'active_1km', 'daily_downwind_wrp', 'elevation', 'EVI', 'num_odor_complaints',
                'dist_dc', 'capacity', 'avg_temp', 'avg_hum', 'precip') 

set.seed(313)

# model 1: since Feb 2022
train <- daily_full %>% 
  filter(day >= '2022-02-01') %>% 
  select(all_of(predictors)) %>% 
  filter(complete.cases(.)) %>% 
  sample_frac(0.8)

test <- anti_join(daily_full %>% 
                  filter(day >= '2022-02-01') %>% 
                  select(all_of(predictors)) %>% 
                  filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints, dist_dc, capacity, avg_temp, avg_hum, precip)`
h2s_da_model_f2 <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)

predictions <- predict(h2s_da_model_f2, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f2)$n - 1)/
  (summary(h2s_da_model_f2)$n - summary(h2s_da_model_f2)$np - 1)

result <- rbind(result, tibble(Model = 'Since Feb 2022',
                               '80/20 Train R-Sq' = summary(h2s_da_model_f2)$r.sq,
                               '80/20 Test R-Sq' = adj_r2_test))


# Model 2: Disaster (Oct-Dec 2021) only
train <- daily_full %>% 
  filter(year == '2021', month %in% c('10', '11', '12')) %>% 
  select(all_of(predictors)) %>% 
  filter(complete.cases(.)) %>% 
  sample_frac(0.8)

test <- anti_join(daily_full %>% 
                  filter(year == '2021', month %in% c('10', '11', '12')) %>% 
                  select(all_of(predictors)) %>% 
                  filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints, dist_dc, capacity, avg_temp, avg_hum, precip)`
h2s_da_model_f3 <- gam(H2S_daily_avg~as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)

predictions <- predict(h2s_da_model_f3, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f3)$n - 1)/
  (summary(h2s_da_model_f3)$n - summary(h2s_da_model_f3)$np - 1)

result <- rbind(result, tibble(Model = 'Disaster Only',
                               '80/20 Train R-Sq' = summary(h2s_da_model_f3)$r.sq,
                               '80/20 Test R-Sq' = adj_r2_test))

# Model 3: Excluding Disaster
train <- daily_full %>% 
  filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>% 
  select(all_of(predictors)) %>% 
  filter(complete.cases(.)) %>% 
  sample_frac(0.8)

test <- anti_join(daily_full %>% 
                  filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>% 
                  select(all_of(predictors)) %>% 
                  filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints, dist_dc, capacity, avg_temp, avg_hum, precip)`
h2s_da_model_f4 <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)

predictions <- predict(h2s_da_model_f4, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f4)$n - 1)/
  (summary(h2s_da_model_f4)$n - summary(h2s_da_model_f4)$np - 1)

result <- rbind(result, tibble(Model = 'Exclude Disaster',
                               '80/20 Train R-Sq' = summary(h2s_da_model_f4)$r.sq,
                               '80/20 Test R-Sq' = adj_r2_test))

# Model 4: Disaster Indicator
train <- daily_full %>% 
  select(all_of(predictors)) %>% 
  mutate(disaster = if_else(year == '2021' & month %in% c('10', '11', '12'), 1, 0)) %>% 
  filter(complete.cases(.)) 

train <- rbind(
  train %>% filter(disaster==1) %>% sample_frac(0.8),
  train %>% filter(disaster==0) %>% sample_frac(0.8)
)

test <- anti_join(daily_full %>% 
                  select(all_of(predictors)) %>% 
                    mutate(disaster = if_else(year == '2021' & month %in% c('10', '11', '12'), 1, 0)) %>% 
                  filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints, dist_dc, capacity, avg_temp, avg_hum, precip, disaster)`
h2s_da_model_f5 <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         disaster +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)

predictions <- predict(h2s_da_model_f5, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f5)$n - 1)/
  (summary(h2s_da_model_f5)$n - summary(h2s_da_model_f5)$np - 1)

result <- rbind(result, tibble(Model = 'Disaster Indicator',
                               '80/20 Train R-Sq' = summary(h2s_da_model_f5)$r.sq,
                               '80/20 Test R-Sq' = adj_r2_test))

# Model 5: Everything
train <- daily_full %>% 
  select(all_of(predictors)) %>% 
  filter(complete.cases(.)) %>% 
  sample_frac(0.8)

train <- rbind(
  train %>% filter(year == '2021' & month %in% c('10', '11', '12')) %>% sample_frac(0.8),
  train %>% filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>% sample_frac(0.8)
  )

test <- anti_join(daily_full %>% 
                  select(all_of(predictors)) %>% 
                  filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints, dist_dc, capacity, avg_temp, avg_hum, precip)`
h2s_da_model_f6 <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)

predictions <- predict(h2s_da_model_f6, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f6)$n - 1)/
  (summary(h2s_da_model_f5)$n - summary(h2s_da_model_f6)$np - 1)

result <- rbind(result, tibble(Model = 'Everything',
                               '80/20 Train R-Sq' = summary(h2s_da_model_f6)$r.sq,
                               '80/20 Test R-Sq' = adj_r2_test))
knitr::kable(tibble(Model = c('Since Feb 2022',
                              'Disaster Only',
                              'Exclude Disaster',
                              'Disaster Indicator',
                              'Everything'),
                    'R-Sq' = c(round(summary(h2s_da_model_f)$r.sq, 2),
                                    round(summary(h2s_da_model_g)$r.sq, 2),
                                    round(summary(h2s_da_model_h)$r.sq, 2),
                                    round(summary(h2s_da_model_i)$r.sq, 2),
                                    round(summary(h2s_da_model_j)$r.sq, 2))) %>% 
               left_join(result, join_by(Model)))
Model R-Sq 80/20 Train R-Sq 80/20 Test R-Sq
Since Feb 2022 0.63 0.6221825 0.6290050
Disaster Only 0.50 0.5148863 -0.1386304
Exclude Disaster 0.45 0.4474583 0.4523375
Disaster Indicator 0.32 0.3225253 0.2875708
Everything 0.32 0.3812678 0.4193177

Cross Validation on each monitor

result_cv <- tibble(Model = character(),
                 'CV Avg Train R-Sq' = numeric(),
                 'CV Avg Test R-Sq' = numeric())

# model 1: since Feb 2022
post2022feb <- daily_full %>% 
  filter(day >= '2022-02-01') %>% 
  select(all_of(predictors), Monitor) %>% 
  filter(complete.cases(.))

monitors <- unique(post2022feb$Monitor)

cv1_result <- tibble(Monitor = character(),
                     'Train Adj-R2' = numeric(),
                     'Train N' = numeric(),
                     'Test Adj-R2' = numeric(),
                     'Test B' = numeric()
                     )

for (i in 1:length(monitors)) {
  train <- post2022feb %>%
    filter(Monitor != monitors[i])
  
  test <- post2022feb %>%
    filter(Monitor == monitors[i])
  
  h2s_da_model_f2b <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)
  
  predictions <- predict(h2s_da_model_f2b, newdata = test)
  r2_test <- R2(test$H2S_daily_avg, predictions)
  adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f2b)$n - 1)/
    (summary(h2s_da_model_f2b)$n - summary(h2s_da_model_f2b)$np - 1)
  
  cv1_result <- rbind(cv1_result, tibble(monitors[i],
                                         summary(h2s_da_model_f2b)$r.sq, 
                                         summary(h2s_da_model_f2b)$n, 
                                         adj_r2_test, 
                                         nrow(test)))
}

result_cv <- rbind(result_cv, tibble(tibble(Model = 'Since Feb 2022',
                                           'CV Avg Train R-Sq' = mean(cv1_result$`summary(h2s_da_model_f2b)$r.sq`),
                                           'CV Avg Test R-Sq' = mean(cv1_result$adj_r2_test))))

# model 2: Disaster only
disaster <- daily_full %>% 
  filter(year == '2021', month %in% c('10', '11', '12')) %>% 
  select(all_of(predictors), Monitor) %>% 
  filter(complete.cases(.))

monitors <- unique(disaster$Monitor)

cv2_result <- tibble(Monitor = character(),
                     'Train Adj-R2' = numeric(),
                     'Train N' = numeric(),
                     'Test Adj-R2' = numeric(),
                     'Test B' = numeric()
                     )

for (i in 1:length(monitors)) {
  train <- disaster %>%
    filter(Monitor != monitors[i])
  
  test <- disaster %>%
    filter(Monitor == monitors[i])
  
  h2s_da_model_f3b <- gam(H2S_daily_avg~ month + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)
  
  predictions <- predict(h2s_da_model_f3b, newdata = test)
  r2_test <- R2(test$H2S_daily_avg, predictions)
  adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f3b)$n - 1)/
    (summary(h2s_da_model_f3b)$n - summary(h2s_da_model_f3b)$np - 1)
  
  cv2_result <- rbind(cv2_result, tibble(monitors[i],
                                         summary(h2s_da_model_f3b)$r.sq, 
                                         summary(h2s_da_model_f3b)$n, 
                                         adj_r2_test, 
                                         nrow(test)))
}

result_cv <- rbind(result_cv, tibble(tibble(Model = 'Disaster Only',
                                           'CV Avg Train R-Sq' = mean(cv2_result$`summary(h2s_da_model_f3b)$r.sq`),
                                           'CV Avg Test R-Sq' = mean(cv2_result$adj_r2_test))))

# model 3: Exclude Disaster
exclude_disaster <- daily_full %>% 
  filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>% 
  select(all_of(predictors), Monitor) %>% 
  filter(complete.cases(.))

monitors <- unique(exclude_disaster$Monitor)

cv3_result <- tibble(Monitor = character(),
                     'Train Adj-R2' = numeric(),
                     'Train N' = numeric(),
                     'Test Adj-R2' = numeric(),
                     'Test B' = numeric()
                     )

for (i in 1:length(monitors)) {
  train <- exclude_disaster %>%
    filter(Monitor != monitors[i])
  
  test <- exclude_disaster %>%
    filter(Monitor == monitors[i])
  
  h2s_da_model_f4b <- gam(H2S_daily_avg~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)
  
  predictions <- predict(h2s_da_model_f4b, newdata = test)
  r2_test <- R2(test$H2S_daily_avg, predictions)
  adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f4b)$n - 1)/
    (summary(h2s_da_model_f4b)$n - summary(h2s_da_model_f4b)$np - 1)
  
  cv3_result <- rbind(cv3_result, tibble(monitors[i],
                                         summary(h2s_da_model_f4b)$r.sq, 
                                         summary(h2s_da_model_f4b)$n, 
                                         adj_r2_test, 
                                         nrow(test)))
}

result_cv <- rbind(result_cv, tibble(tibble(Model = 'Exclude Disaster',
                                           'CV Avg Train R-Sq' = mean(cv3_result$`summary(h2s_da_model_f4b)$r.sq`),
                                           'CV Avg Test R-Sq' = mean(cv3_result$adj_r2_test))))

# model 4: Disaster Indicator 
full_complete <- daily_full %>% 
  select(all_of(predictors), Monitor) %>% 
  filter(complete.cases(.)) %>%
  mutate(disaster = if_else(year == '2021' &  month %in% c('10', '11', '12'), 1, 0))

monitors <- unique(full_complete$Monitor)

cv4_result <- tibble(Monitor = character(),
                     'Train Adj-R2' = numeric(),
                     'Train N' = numeric(),
                     'Test Adj-R2' = numeric(),
                     'Test B' = numeric()
                     )

for (i in 1:length(monitors)) {
  train <- full_complete %>%
    filter(Monitor != monitors[i])
  
  test <- full_complete %>%
    filter(Monitor == monitors[i])
  
  h2s_da_model_f5b <- gam(H2S_daily_avg~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         disaster + capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)
  
  predictions <- predict(h2s_da_model_f5b, newdata = test)
  r2_test <- R2(test$H2S_daily_avg, predictions)
  adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f5b)$n - 1)/
    (summary(h2s_da_model_f5b)$n - summary(h2s_da_model_f5b)$np - 1)
  
  cv4_result <- rbind(cv4_result, tibble(monitors[i],
                                         summary(h2s_da_model_f5b)$r.sq, 
                                         summary(h2s_da_model_f5b)$n, 
                                         adj_r2_test, 
                                         nrow(test)))
}

result_cv <- rbind(result_cv, tibble(tibble(Model = 'Disaster Indicator',
                                           'CV Avg Train R-Sq' = mean(cv4_result$`summary(h2s_da_model_f5b)$r.sq`),
                                           'CV Avg Test R-Sq' = mean(cv4_result$adj_r2_test))))

# model 5: Everything 
full_complete <- daily_full %>% 
  select(all_of(predictors), Monitor) %>% 
  filter(complete.cases(.))

monitors <- unique(full_complete$Monitor)

cv5_result <- tibble(Monitor = character(),
                     'Train Adj-R2' = numeric(),
                     'Train N' = numeric(),
                     'Test Adj-R2' = numeric(),
                     'Test B' = numeric()
                     )

for (i in 1:length(monitors)) {
  train <- full_complete %>%
    filter(Monitor != monitors[i])
  
  test <- full_complete %>%
    filter(Monitor == monitors[i])
  
  h2s_da_model_f6b <- gam(H2S_daily_avg~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)
  
  predictions <- predict(h2s_da_model_f6b, newdata = test)
  r2_test <- R2(test$H2S_daily_avg, predictions)
  adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f6b)$n - 1)/
    (summary(h2s_da_model_f6b)$n - summary(h2s_da_model_f6b)$np - 1)
  
  cv5_result <- rbind(cv5_result, tibble(monitors[i],
                                         summary(h2s_da_model_f6b)$r.sq, 
                                         summary(h2s_da_model_f6b)$n, 
                                         adj_r2_test, 
                                         nrow(test)))
}

result_cv <- rbind(result_cv, tibble(tibble(Model = 'Everything',
                                           'CV Avg Train R-Sq' = mean(cv5_result$`summary(h2s_da_model_f6b)$r.sq`),
                                           'CV Avg Test R-Sq' = mean(cv5_result$adj_r2_test))))

cv1_result
cv2_result
cv3_result
cv4_result
cv5_result
knitr::kable(tibble(Model = c('Since Feb 2022',
                              'Disaster Only',
                              'Exclude Disaster',
                              'Disaster Indicator',
                              'Everything'),
                    'Train R-sq' = c(round(summary(h2s_da_model_f)$r.sq, 2),
                                    round(summary(h2s_da_model_g)$r.sq, 2),
                                    round(summary(h2s_da_model_h)$r.sq, 2),
                                    round(summary(h2s_da_model_i)$r.sq, 2),
                                    round(summary(h2s_da_model_j)$r.sq, 2))) %>% 
               left_join(result, join_by(Model)) %>%
               left_join(result_cv, join_by(Model)))
Model Train R-sq 80/20 Train R-Sq 80/20 Test R-Sq CV Avg Train R-Sq CV Avg Test R-Sq
Since Feb 2022 0.63 0.6221825 0.6290050 0.6388156 0.1712204
Disaster Only 0.50 0.5148863 -0.1386304 0.6710647 0.0892710
Exclude Disaster 0.45 0.4474583 0.4523375 0.4655053 0.0669122
Disaster Indicator 0.32 0.3225253 0.2875708 0.4928048 0.1052525
Everything 0.32 0.3812678 0.4193177 0.4911042 0.1003518

Monthly

h2s_ma_model_a <- gam(H2S_monthly_average~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery, data = full_data)
summary(h2s_ma_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_monthly_average ~ s(as.numeric(month), bs = "cc") + year + 
##     wd_sec + ws + I(1/MinDist^2) + Refinery
## 
## Parametric coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       1.484e+00  3.008e-01   4.933 8.11e-07 ***
## year2021                          4.554e+00  1.702e-01  26.763  < 2e-16 ***
## year2022                         -4.996e+00  1.893e-01 -26.389  < 2e-16 ***
## year2023                         -2.289e+00  2.387e-01  -9.590  < 2e-16 ***
## wd_secQ2                         -9.342e-02  1.780e-01  -0.525      0.6    
## wd_secQ3                          1.659e+00  1.810e-01   9.163  < 2e-16 ***
## wd_secQ4                         -8.824e-01  1.711e-01  -5.158 2.50e-07 ***
## ws                                7.898e-02  1.923e-02   4.107 4.01e-05 ***
## I(1/MinDist^2)                    9.128e+05  1.544e+05   5.913 3.37e-09 ***
## RefineryMarathon (Carson)         2.223e+01  2.465e-01  90.188  < 2e-16 ***
## RefineryMarathon (Wilmington)     3.671e+00  2.866e-01  12.809  < 2e-16 ***
## RefineryPhillips 66 (Wilmington)  2.444e+00  2.532e-01   9.653  < 2e-16 ***
## RefineryTorrance Refinery        -1.154e+00  2.331e-01  -4.950 7.42e-07 ***
## RefineryValero Refinery           5.944e+00  2.803e-01  21.205  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                      edf Ref.df    F p-value    
## s(as.numeric(month))   8      8 5718  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0357   Deviance explained = 3.57%
## GCV = 5417.7  Scale est. = 5417.6    n = 2057242
plot(h2s_ma_model_a)

# h2s_ma_model_b <- gam(H2S_monthly_average~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery+s(latitude, longitude, bs='tp', k=10), data = full_data)
# summary(h2s_ma_model_b)
# plot(h2s_ma_model_b)

XGBoost Since 2022 Feb

xgb_result <- tibble(Model = character(),
                 'R-Sq' = numeric(),
                 '80/20 Train R-Sq' = numeric(),
                 '80/20 Test R-Sq' = numeric())

daily_full <- daily_full %>%
  mutate(Refinery = str_replace_all(str_replace_all(Refinery, '[()]', ''), ' ', '_'),
         Monitor = str_replace_all(Monitor, ' ', '_'),
         weekday = as.character(weekday),
         daily_downwind_ref = as.integer(daily_downwind_ref),
         daily_downwind_wrp = as.integer(daily_downwind_wrp))

train <- daily_full[complete.cases(daily_full),] %>%
  filter(day >= '2022-02-01')

train <- fastDummies::dummy_cols(train %>%
                   select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                             monitor_lat, monitor_lon, county, dist_213)) %>%
                   mutate(MinDist = 1/(MinDist^2),
                          dist_wrp = 1/(dist_wrp^2)),
                   remove_selected_columns = TRUE)

# Try for a continuous month
tune_grid <- expand.grid(nrounds = c(100, 200, 500),
                         max_depth = c(3, 4, 5),
                         eta = c(0.1, 0.3),
                         gamma = c(0.01, 0.001),
                         colsample_bytree = c(0.5, 1),
                         min_child_weight = 0,
                         subsample = c(0.5, 0.75, 1))

# Run algorithms using 10-fold cross validation
control <- trainControl(method="cv", 
                        number=10,
                        verboseIter=TRUE, 
                        search='grid')

fit.xgb_da <- readRDS('rfiles/fit.xgb_da.rds')
# fit.xgb_da <- train(H2S_daily_avg~.,
#                  method = 'xgbTree',
#                  data = train,
#                  trControl=control,
#                  tuneGrid = tune_grid,
#                  tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da, 'rfiles/fit.xgb_da.rds')
getTrainPerf(fit.xgb_da)
fit.xgb_da$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.2 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "5", gamma = "0.001", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 42 
## niter: 500
## nfeatures : 42 
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints year_2022 year_2023 month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 78     500         5 0.1 0.001              0.5                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da,scale=FALSE)
plot(imp, top = 20)

SHAP for XGBoost above (Caret)

matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))  
xgb.plot.shap(data = matrix, 
              model = fit.xgb_da$finalModel, top_n = 12, n_col = 3)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 3.83e+05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 4000
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0

xgb.ggplot.shap.summary(data = matrix, 
              model = fit.xgb_da$finalModel, top_n = 20)

80/20 CV

set.seed(313)

train <- daily_full[complete.cases(daily_full),] %>%
  filter(day >= '2022-02-01') %>%
  sample_frac(0.8)

test <- anti_join(daily_full[complete.cases(daily_full),] %>% filter(day >= '2022-02-01'), 
                  train)
## Joining with `by = join_by(H2S_daily_max, H2S_daily_avg, H2S_monthly_average,
## monitor_lat, monitor_lon, mon_utm_x, mon_utm_y, Monitor, MinDist,
## Converted_Angle, Refinery, year, month, day, weekday, active_1km,
## monthly_oil_1km, monthly_gas_1km, dist_wrp, capacity, wrp_angle, dist_dc,
## dist_213, avg_temp, avg_hum, precip, ws_avg, wd_avg, daily_downwind_ref,
## daily_downwind_wrp, elevation, EVI, county, num_odor_complaints)`
train <- fastDummies::dummy_cols(train %>%
                   select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                             monitor_lat, monitor_lon, county, dist_213)) %>%
                   mutate(MinDist = 1/(MinDist^2),
                          dist_wrp = 1/(dist_wrp^2)),
                   remove_selected_columns = TRUE)

test <- fastDummies::dummy_cols(test %>%
                 select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                           monitor_lat, monitor_lon, county, dist_213)) %>%
                 mutate(MinDist = 1/(MinDist^2),
                        dist_wrp = 1/(dist_wrp^2)),
                 remove_selected_columns = TRUE)


# Try for a continuous month
tune_grid <- expand.grid(nrounds = c(100, 200, 500),
                         max_depth = c(3, 4, 5),
                         eta = c(0.1, 0.3),
                         gamma = c(0.01, 0.001),
                         colsample_bytree = c(0.5, 1),
                         min_child_weight = 0,
                         subsample = c(0.5, 0.75, 1))

# Run algorithms using 10-fold cross validation
control <- trainControl(method="cv", 
                        number=10,
                        verboseIter=TRUE, 
                        search='grid')

fit.xgb_da_8020 <- readRDS('rfiles/fit.xgb_da_8020.rds')
# fit.xgb_da_8020 <- train(H2S_daily_avg~.,
#                  method = 'xgbTree',
#                  data = train,
#                  trControl=control,
#                  tuneGrid = tune_grid,
#                  tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da_8020, 'rfiles/fit.xgb_da_8020.rds')
getTrainPerf(fit.xgb_da_8020)
# Test statistics
test_stat <- postResample(predict(fit.xgb_da_8020$finalModel, 
                       newdata = test %>% select(-H2S_daily_avg) %>% as.matrix()), 
             test %>% pull(H2S_daily_avg))
test_stat
##      RMSE  Rsquared       MAE 
## 0.2871070 0.7610102 0.1862030
xgb_result <- rbind(xgb_result, 
                    tibble(Model = 'Since Feb 2022',
                           'R-Sq' = getTrainPerf(fit.xgb_da)$TrainRsquared,
                           '80/20 Train R-Sq' = getTrainPerf(fit.xgb_da_8020)$TrainRsquared,
                           '80/20 Test R-Sq' = test_stat[[2]]))
fit.xgb_da_8020$finalModel
## ##### xgb.Booster
## raw: 1.2 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "5", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## xgb.attributes:
##   niter
## # of features: 42 
## niter: 500
## nfeatures : 42 
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints year_2022 year_2023 month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 96     500         5 0.1  0.01              0.5                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_8020,scale=FALSE)
plot(imp, top = 20)

SHAP for XGBoost above (Caret)

matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))  
xgb.plot.shap(data = matrix, 
              model = fit.xgb_da_8020$finalModel, top_n = 12, n_col = 3)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 3.83e+05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 4000
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0
## Warning in sqrt(sum.squares/one.delta): NaNs produced

xgb.ggplot.shap.summary(data = matrix, 
              model = fit.xgb_da_8020$finalModel, top_n = 20)

CV by leaving each monitor out once

post2022feb <- daily_full %>% 
  filter(day >= '2022-02-01')

monitor_names <- unique(post2022feb$Monitor)

# for (i in 1:length(monitor_names)) {
#   train <- post2022feb %>%

#     filter(Monitor != monitor_names[i])
# 
#   train <- train[complete.cases(train),]
# 
#   train <- fastDummies::dummy_cols(train %>%
#                      select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
#                              monitor_lat, monitor_lon, county, dist_213)) %>%
#                      mutate(MinDist = 1/(MinDist^2),
#                             dist_wrp = 1/(dist_wrp^2)),
#                      remove_selected_columns = TRUE)
# 
#   fit.xgb_da <- train(H2S_daily_avg~.,
#                  method = 'xgbTree',
#                  data = train,
#                  trControl=control,
#                  tuneGrid = tune_grid,
#                  tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# 
#   saveRDS(fit.xgb_da, paste0('rfiles/fit.xgb_da_', monitor_names[i], '.rds'))
# }
model_stats <- tibble(Monitor = character(), train_r2 = numeric(), test_r2 = numeric())

add_cols <- function(df, cols) {
  add <- cols[!cols %in% names(df)]
  if(length(add) !=0 ) df[add] <- NA
  return(df)
}

for (i in setdiff(1:length(monitor_names), c(9))) {

  fit.xgb_da <- readRDS(paste0('rfiles/fit.xgb_da_', monitor_names[i], '.rds'))

  test <- post2022feb %>%
    filter(Monitor == monitor_names[i])

  test <- fastDummies::dummy_cols(test[complete.cases(test),] %>%
                     ungroup() %>%
                     select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                             monitor_lat, monitor_lon, county, dist_213)) %>%
                       add_cols(c('year_2022', 'year_2023', 'month_01', 'month_02',
                                  'month_03', 'month_04', 'month_05', 'month_06',
                                  'month_07', 'month_08', 'month_09', 'month_10',
                                  'month_11', 'month_12')) %>%
                     mutate(MinDist = 1/(MinDist^2),
                            dist_wrp = 1/(dist_wrp^2),
                            year_2022 = ifelse(is.na(year_2022), 0, year_2022),
                            year_2023 = ifelse(is.na(year_2023), 0, year_2023),
                            month_01 = ifelse(is.na(month_01), 0, month_01),
                            month_02 = ifelse(is.na(month_02), 0, month_02),
                            month_03 = ifelse(is.na(month_03), 0, month_03),
                            month_04 = ifelse(is.na(month_04), 0, month_04),
                            month_05 = ifelse(is.na(month_05), 0, month_05),
                            month_06 = ifelse(is.na(month_06), 0, month_06),
                            month_07 = ifelse(is.na(month_07), 0, month_07),
                            month_08 = ifelse(is.na(month_08), 0, month_08),
                            month_09 = ifelse(is.na(month_09), 0, month_09),
                            month_10 = ifelse(is.na(month_10), 0, month_10),
                            month_11 = ifelse(is.na(month_11), 0, month_11),
                            month_12 = ifelse(is.na(month_12), 0, month_12)),
                     remove_selected_columns = TRUE)

  train_r2 <- getTrainPerf(fit.xgb_da)$TrainRsquared

  predictions <- predict(fit.xgb_da$finalModel,
                         newdata = test %>% select(-H2S_daily_avg) %>% as.matrix())

  test_r2 <- R2(test %>% pull(H2S_daily_avg), predictions)

  model_stats <- rbind(model_stats, tibble(Monitor = monitor_names[i], train_r2, test_r2))
}

model_stats
tibble(Monitor = 'Total Average', train_r2 = mean(model_stats$train_r2, na.rm = T),
                             test_r2 = mean(model_stats$test_r2, na.rm = T))

XGBoost Disaster

Daily average model

train <- daily_full[complete.cases(daily_full),] %>%
  filter(year == '2021', month %in% c('10', '11', '12'))

train <- fastDummies::dummy_cols(train %>%
                   select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                             monitor_lat, monitor_lon, county, dist_213, year)) %>%
                   mutate(MinDist = 1/(MinDist^2),
                          dist_wrp = 1/(dist_wrp^2)),
                   remove_selected_columns = TRUE)

fit.xgb_da_dis <- readRDS('rfiles/fit.xgb_da_dis.rds')
# fit.xgb_da_dis <- train(H2S_daily_avg~.,
#                  method = 'xgbTree',
#                  data = train,
#                  trControl=control,
#                  tuneGrid = tune_grid,
#                  tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da_dis, 'rfiles/fit.xgb_da_dis.rds')
getTrainPerf(fit.xgb_da_dis)
fit.xgb_da_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.1 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "5", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31 
## niter: 500
## nfeatures : 31 
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 96     500         5 0.1  0.01              0.5                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_dis,scale=FALSE)
plot(imp, top = 20)

SHAP for XGBoost above (Caret)

matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))
xgb.plot.shap(data = matrix,
              model = fit.xgb_da_dis$finalModel, top_n = 12, n_col = 3)

xgb.ggplot.shap.summary(data = matrix,
              model = fit.xgb_da_dis$finalModel, top_n = 20)

80/20 CV

set.seed(313)

train <- daily_full[complete.cases(daily_full),] %>%
  filter(year == '2021', month %in% c('10', '11', '12')) %>%
  sample_frac(0.8)

test <- anti_join(daily_full[complete.cases(daily_full),] %>% filter(year == '2021', month %in% c('10', '11', '12')), 
                  train)
## Joining with `by = join_by(H2S_daily_max, H2S_daily_avg, H2S_monthly_average,
## monitor_lat, monitor_lon, mon_utm_x, mon_utm_y, Monitor, MinDist,
## Converted_Angle, Refinery, year, month, day, weekday, active_1km,
## monthly_oil_1km, monthly_gas_1km, dist_wrp, capacity, wrp_angle, dist_dc,
## dist_213, avg_temp, avg_hum, precip, ws_avg, wd_avg, daily_downwind_ref,
## daily_downwind_wrp, elevation, EVI, county, num_odor_complaints)`
train <- fastDummies::dummy_cols(train %>%
                   select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                             monitor_lat, monitor_lon, county, dist_213, year)) %>%
                   mutate(MinDist = 1/(MinDist^2),
                          dist_wrp = 1/(dist_wrp^2)),
                   remove_selected_columns = TRUE)

test <- fastDummies::dummy_cols(test %>%
                 select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                           monitor_lat, monitor_lon, county, dist_213, year)) %>%
                 mutate(MinDist = 1/(MinDist^2),
                        dist_wrp = 1/(dist_wrp^2)),
                 remove_selected_columns = TRUE)

fit.xgb_da_dis_8020 <- readRDS('rfiles/fit.xgb_da_dis_8020.rds')
# fit.xgb_da_dis_8020 <- train(H2S_daily_avg~.,
#                  method = 'xgbTree',
#                  data = train,
#                  trControl=control,
#                  tuneGrid = tune_grid,
#                  tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da_dis_8020, 'rfiles/fit.xgb_da_dis_8020.rds')
getTrainPerf(fit.xgb_da_dis_8020)
# Test statistics
test_stat <- postResample(predict(fit.xgb_da_dis_8020$finalModel, 
                       newdata = test %>% select(-H2S_daily_avg) %>% as.matrix()), 
             test %>% pull(H2S_daily_avg))
test_stat
##       RMSE   Rsquared        MAE 
## 21.0148229  0.6897725  3.6093483
xgb_result <- rbind(xgb_result, 
                    tibble(Model = 'Disaster Only',
                           'R-Sq' = getTrainPerf(fit.xgb_da_dis)$TrainRsquared,
                           '80/20 Train R-Sq' = getTrainPerf(fit.xgb_da_dis_8020)$TrainRsquared,
                           '80/20 Test R-Sq' = test_stat[[2]]))
fit.xgb_da_dis_8020$finalModel
## ##### xgb.Booster
## raw: 209.9 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.3", max_depth = "5", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## xgb.attributes:
##   niter
## # of features: 31 
## niter: 100
## nfeatures : 31 
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 205     100         5 0.3  0.01              0.5                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_dis_8020,scale=FALSE)
plot(imp, top = 20)

SHAP for XGBoost above (Caret)

matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))  
xgb.plot.shap(data = matrix, 
              model = fit.xgb_da_dis_8020$finalModel, top_n = 12, n_col = 3)

xgb.ggplot.shap.summary(data = matrix, 
              model = fit.xgb_da_dis_8020$finalModel, top_n = 20)

XGBoost Full

Daily average model

train <- daily_full[complete.cases(daily_full),]

train <- fastDummies::dummy_cols(train %>%
                   select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                             monitor_lat, monitor_lon, county, dist_213, year)) %>%
                   mutate(MinDist = 1/(MinDist^2),
                          dist_wrp = 1/(dist_wrp^2)),
                   remove_selected_columns = TRUE)

fit.xgb_da_full <- readRDS('rfiles/fit.xgb_da_full.rds')
# fit.xgb_da_full <- train(H2S_daily_avg~.,
#                  method = 'xgbTree',
#                  data = train,
#                  trControl=control,
#                  tuneGrid = tune_grid,
#                  tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da_full, 'rfiles/fit.xgb_da_full.rds')
getTrainPerf(fit.xgb_da_full)
fit.xgb_da_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 114.2 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "3", gamma = "0.001", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.5", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 40 
## niter: 100
## nfeatures : 40 
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed 
## problemType : Regression 
## tuneValue :
##    nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 1     100         3 0.1 0.001              0.5                0       0.5
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_full,scale=FALSE)
plot(imp, top = 20)

SHAP for XGBoost above (Caret)

matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))
xgb.plot.shap(data = matrix,
              model = fit.xgb_da_full$finalModel, top_n = 12, n_col = 3)

xgb.ggplot.shap.summary(data = matrix,
              model = fit.xgb_da_full$finalModel, top_n = 20)

80/20 CV

set.seed(313)

train <- rbind(
  daily_full[complete.cases(daily_full),] %>%
    filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>%
    sample_frac(0.8),
  daily_full[complete.cases(daily_full),] %>%
    filter(year == '2021', month %in% c('10', '11', '12')) %>%
    sample_frac(0.8)
)

test <- anti_join(daily_full[complete.cases(daily_full),], train)
## Joining with `by = join_by(H2S_daily_max, H2S_daily_avg, H2S_monthly_average,
## monitor_lat, monitor_lon, mon_utm_x, mon_utm_y, Monitor, MinDist,
## Converted_Angle, Refinery, year, month, day, weekday, active_1km,
## monthly_oil_1km, monthly_gas_1km, dist_wrp, capacity, wrp_angle, dist_dc,
## dist_213, avg_temp, avg_hum, precip, ws_avg, wd_avg, daily_downwind_ref,
## daily_downwind_wrp, elevation, EVI, county, num_odor_complaints)`
train <- fastDummies::dummy_cols(train %>%
                   select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                             monitor_lat, monitor_lon, county, dist_213, year)) %>%
                   mutate(MinDist = 1/(MinDist^2),
                          dist_wrp = 1/(dist_wrp^2)),
                   remove_selected_columns = TRUE)

test <- fastDummies::dummy_cols(test %>%
                 select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                           monitor_lat, monitor_lon, county, dist_213, year)) %>%
                 mutate(MinDist = 1/(MinDist^2),
                        dist_wrp = 1/(dist_wrp^2)),
                 remove_selected_columns = TRUE)

fit.xgb_da_full_8020 <- readRDS('rfiles/fit.xgb_da_full_8020.rds')
# fit.xgb_da_full_8020 <- train(H2S_daily_avg~.,
#                  method = 'xgbTree',
#                  data = train,
#                  trControl=control,
#                  tuneGrid = tune_grid,
#                  tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da_full_8020, 'rfiles/fit.xgb_da_full_8020.rds')
getTrainPerf(fit.xgb_da_full_8020)
# Test statistics
test_stat <- postResample(predict(fit.xgb_da_full_8020$finalModel, 
                       newdata = test %>% select(-H2S_daily_avg) %>% as.matrix()), 
             test %>% pull(H2S_daily_avg))
test_stat
##      RMSE  Rsquared       MAE 
## 5.5777177 0.7863225 0.5455890
xgb_result <- rbind(xgb_result, 
                    tibble(Model = 'Everything',
                           'R-Sq' = getTrainPerf(fit.xgb_da_full_8020)$TrainRsquared,
                           '80/20 Train R-Sq' = getTrainPerf(fit.xgb_da_full_8020)$TrainRsquared,
                           '80/20 Test R-Sq' = test_stat[[2]]))
predictions <- predict(fit.xgb_da_full_8020$finalModel, 
                       newdata = test %>% select(-H2S_daily_avg) %>% as.matrix())
par(mfrow=c(1,2))
plot(test %>% pull(H2S_daily_avg) ~ predictions, asp=1, pch=20,
     ylab="Observed", xlab="Predicted", main="Full graph")
abline(0,1); grid()
plot(test %>% pull(H2S_daily_avg) ~ predictions, asp=1, pch=20,
     ylab="Observed",xlab="Predicted",  main="Zoomed in",
     xlim = c(0, 15), ylim = c(0, 15))
grid(); abline(0,1)

fit.xgb_da_full_8020$finalModel
## ##### xgb.Booster
## raw: 1.1 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "5", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## xgb.attributes:
##   niter
## # of features: 40 
## niter: 500
## nfeatures : 40 
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 99     500         5 0.1  0.01              0.5                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_full_8020,scale=FALSE)
plot(imp, top = 20)

SHAP for XGBoost above (Caret)

matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))  
xgb.plot.shap(data = matrix, 
              model = fit.xgb_da_full_8020$finalModel, top_n = 12, n_col = 3)

xgb.ggplot.shap.summary(data = matrix, 
              model = fit.xgb_da_full_8020$finalModel, top_n = 20)

XGBoost Result

xgb_result

Extreme Value Models

EVGAM

# for this part, I will fit from Jan-2021 to April-2022
ev_data <- daily_full %>%
  filter(day >= '2021-01-01',  day < '2022-05-01') %>%
  mutate(month = as.numeric(month),
         weekday = as.character(weekday),
         day = as.numeric(day),
         dist_wrp = I(1/dist_wrp^2),
         dist_dc = I(1/dist_dc^2),
         mon_utm_x = I(mon_utm_x/10^3), 
         mon_utm_y = I(mon_utm_y/10^3))
daily_max_gev <- list(H2S_daily_max ~ s(month,bs='cc') + year, 
                 ~ s(month,bs='cc') + year +
                   weekday + wd_avg + ws_avg + daily_downwind_ref + 
                   capacity + dist_wrp + 
                   s(mon_utm_x, mon_utm_y, bs='tp', k = 10) +  
                   te(mon_utm_x, mon_utm_y, day, 
                      k=c(10,10),d=c(2,1),bs=c('tp','cc')) + 
                   monthly_oil_1km + monthly_gas_1km + active_1km + 
                   daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                   dist_dc + avg_temp + avg_hum + precip, 
                 ~ 1)

dm_gev <- readRDS('rfiles/dm_gev.rds')
# dm_gev <- evgam(daily_max_gev, ev_data, family = "gev")
# saveRDS(dm_gev, 'rfiles/dm_gev.rds')
summary(dm_gev)
## 
## ** Parametric terms **
## 
## location
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)     1.39       0.02   57.56   <2e-16
## year2022        0.14       0.06    2.39  0.00846
## 
## logscale
##                     Estimate Std. Error       t value Pr(>|t|)
## (Intercept)            -6.61       1.78 -3.720000e+00 9.93e-05
## year2022                0.49       0.14  3.560000e+00 0.000184
## weekdayMon              0.05       0.05  9.800000e-01    0.162
## weekdaySat             -0.01       0.05 -1.900000e-01    0.424
## weekdaySun              0.00       0.05 -4.000000e-02    0.484
## weekdayThu              0.11       0.05  2.170000e+00    0.015
## weekdayTue              0.08       0.05  1.630000e+00   0.0516
## weekdayWed              0.03       0.05  5.100000e-01    0.303
## wd_avg                  0.00       0.00  2.300000e+00   0.0108
## ws_avg                  0.01       0.01  6.000000e-01    0.273
## daily_downwind_ref      0.07       0.05  1.400000e+00   0.0813
## capacity                0.00       0.00 -2.990000e+00   0.0014
## dist_wrp            45424.75       0.00  2.423312e+10   <2e-16
## monthly_oil_1km         0.00       0.00  7.040000e+00 9.96e-13
## monthly_gas_1km         0.00       0.00 -1.058000e+01   <2e-16
## active_1km              0.03       0.03  1.050000e+00    0.148
## daily_downwind_wrp     -0.03       0.06 -5.200000e-01    0.303
## elevation              -0.03       0.01 -3.160000e+00 0.000798
## EVI                    -0.46       0.15 -3.010000e+00  0.00131
## num_odor_complaints     0.05       0.01  8.660000e+00   <2e-16
## dist_dc              3098.92       0.00  2.100542e+07   <2e-16
## avg_temp                0.01       0.00  1.300000e+00   0.0962
## avg_hum                 0.00       0.00 -1.930000e+00   0.0267
## precip                 -0.06       0.08 -7.400000e-01     0.23
## 
## shape
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)     0.21       0.02   11.44   <2e-16
## 
## ** Smooth terms **
## 
## location
##          edf max.df Chi.sq Pr(>|t|)
## s(month) 6.2      8 375.69   <2e-16
## 
## logscale
##                               edf max.df Chi.sq Pr(>|t|)
## s(month)                     7.80      8 356.16   <2e-16
## s(mon_utm_x,mon_utm_y)       5.52      9  27.63 4.12e-05
## te(mon_utm_x,mon_utm_y,day) 54.02     80 290.64   <2e-16
plot(dm_gev)

MGCV

m1 <- readRDS('rfiles/m1.rds')
# m1 <- gam(list(H2S_daily_max ~ s(month,bs='cc') + year,
#                ~ s(month,bs='cc') + year +
#                    weekday + wd_avg + ws_avg + daily_downwind_ref +
#                    capacity + dist_wrp +
#                    s(mon_utm_x, mon_utm_y, bs='tp', k = 10) +
#                    te(mon_utm_x, mon_utm_y, day,
#                       k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
#                    monthly_oil_1km + monthly_gas_1km + active_1km +
#                    daily_downwind_wrp + elevation + EVI + num_odor_complaints +
#                    dist_dc + avg_temp + avg_hum + precip,
#                ~ 1),
#           data = ev_data, method = "REML",
#           family = gevlss(link = list("identity", "identity", "identity")))
# saveRDS(m1, 'rfiles/m1.rds')
summary(m1)
## 
## Family: gevlss 
## Link function: identity identity identity 
## 
## Formula:
## H2S_daily_max ~ s(month, bs = "cc") + year
## ~s(month, bs = "cc") + year + weekday + wd_avg + ws_avg + daily_downwind_ref + 
##     capacity + dist_wrp + s(mon_utm_x, mon_utm_y, bs = "tp", 
##     k = 10) + te(mon_utm_x, mon_utm_y, day, k = c(10, 10), d = c(2, 
##     1), bs = c("tp", "cc")) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km + daily_downwind_wrp + elevation + EVI + num_odor_complaints + 
##     dist_dc + avg_temp + avg_hum + precip
## ~1
## 
## Parametric coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            1.392e+00  2.397e-02  58.102  < 2e-16 ***
## year2022               1.361e-01  5.650e-02   2.408 0.016032 *  
## (Intercept).1         -6.973e+00  1.758e+00  -3.966 7.30e-05 ***
## year2022.1             4.965e-01  1.393e-01   3.565 0.000364 ***
## weekdayMon.1           5.013e-02  5.205e-02   0.963 0.335492    
## weekdaySat.1          -9.562e-03  4.864e-02  -0.197 0.844165    
## weekdaySun.1          -3.141e-03  4.897e-02  -0.064 0.948862    
## weekdayThu.1           1.112e-01  5.116e-02   2.173 0.029786 *  
## weekdayTue.1           8.063e-02  5.002e-02   1.612 0.106935    
## weekdayWed.1           2.570e-02  5.172e-02   0.497 0.619250    
## wd_avg.1               4.441e-04  1.908e-04   2.328 0.019930 *  
## ws_avg.1               5.815e-03  1.002e-02   0.580 0.561639    
## daily_downwind_ref.1   6.469e-02  4.793e-02   1.350 0.177101    
## capacity.1            -1.406e-03  5.051e-04  -2.785 0.005359 ** 
## dist_wrp.1            -3.640e+05  1.663e+06  -0.219 0.826789    
## monthly_oil_1km.1      8.399e-04  1.148e-04   7.313 2.62e-13 ***
## monthly_gas_1km.1     -2.707e-03  2.570e-04 -10.533  < 2e-16 ***
## active_1km.1           2.400e-02  2.542e-02   0.944 0.345128    
## daily_downwind_wrp.1  -3.218e-02  6.327e-02  -0.509 0.611053    
## elevation.1           -3.246e-02  1.071e-02  -3.031 0.002436 ** 
## EVI.1                 -4.050e-01  4.274e-01  -0.947 0.343434    
## num_odor_complaints.1  5.175e-02  5.954e-03   8.692  < 2e-16 ***
## dist_dc.1              3.003e+03  5.827e+02   5.154 2.55e-07 ***
## avg_temp.1             6.181e-03  4.631e-03   1.335 0.182010    
## avg_hum.1             -2.800e-03  1.439e-03  -1.946 0.051672 .  
## precip.1              -5.810e-02  7.946e-02  -0.731 0.464715    
## (Intercept).2          2.061e-01  1.778e-02  11.589  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                  edf Ref.df Chi.sq p-value    
## s(month)                       5.946  8.000 531.90 < 2e-16 ***
## s.1(month)                     7.743  8.000 372.15 < 2e-16 ***
## s.1(mon_utm_x,mon_utm_y)       4.972  5.554  22.42 0.00041 ***
## te.1(mon_utm_x,mon_utm_y,day) 53.079 80.000 380.71 < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Deviance explained =   NA%
## -REML = 7658.9  Scale est. = 1         n = 3904
plot(m1, pages = 1, scheme = 1, scale = 0, seWithMean = TRUE)